mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-09 20:55:31 -05:00
in CCString, use KMP for faster sub-string search; add find_all{,_l}
This commit is contained in:
parent
9c338f193e
commit
5f188c4f7e
2 changed files with 165 additions and 32 deletions
|
|
@ -70,32 +70,135 @@ let is_sub ~sub i s j ~len =
|
||||||
if i+len > String.length sub then invalid_arg "CCString.is_sub";
|
if i+len > String.length sub then invalid_arg "CCString.is_sub";
|
||||||
_is_sub ~sub i s j ~len
|
_is_sub ~sub i s j ~len
|
||||||
|
|
||||||
(* note: inefficient *)
|
type _ direction =
|
||||||
let find ?(start=0) ~sub s =
|
| Direct : [`Direct] direction
|
||||||
let n = String.length sub in
|
| Reverse : [`Reverse] direction
|
||||||
let i = ref start in
|
|
||||||
try
|
module KMP = struct
|
||||||
while !i + n <= String.length s do
|
type 'a pattern = {
|
||||||
if _is_sub ~sub 0 s !i ~len:n then raise Exit;
|
failure : int array;
|
||||||
incr i
|
str : string;
|
||||||
|
}
|
||||||
|
(* invariant: [length failure = length str].
|
||||||
|
We use a phantom type to avoid mixing the directions. *)
|
||||||
|
|
||||||
|
let pattern_length p = String.length p.str
|
||||||
|
|
||||||
|
(* access the [i]-th element of [s] according to direction [dir] *)
|
||||||
|
let get_
|
||||||
|
: type a. dir:a direction -> string -> int -> char
|
||||||
|
= fun ~dir -> match dir with
|
||||||
|
| Direct -> (fun s i -> s.[i])
|
||||||
|
| Reverse -> (fun s i -> s.[String.length s - i - 1])
|
||||||
|
|
||||||
|
let compile_
|
||||||
|
: type a. dir:a direction -> string -> a pattern
|
||||||
|
= fun ~dir str ->
|
||||||
|
let len = length str in
|
||||||
|
let get = get_ ~dir in (* how to read elements of the string *)
|
||||||
|
match len with
|
||||||
|
| 0 -> {failure=[| |]; str;}
|
||||||
|
| 1 -> {failure=[| -1 |]; str;}
|
||||||
|
| _ ->
|
||||||
|
(* at least 2 elements, the algorithm can work *)
|
||||||
|
let failure = Array.make len 0 in
|
||||||
|
(* i: current index in str *)
|
||||||
|
let i = ref 1 in
|
||||||
|
(* j: index of candidate substring *)
|
||||||
|
let j = ref 0 in
|
||||||
|
while !i < len-1 do
|
||||||
|
match !j with
|
||||||
|
| _ when get str !i = get str !j ->
|
||||||
|
(* substring starting at !j continues matching current char *)
|
||||||
|
i := !i+1;
|
||||||
|
j := !j+1;
|
||||||
|
failure.(!i) <- !j;
|
||||||
|
| 0 ->
|
||||||
|
(* back to the beginning *)
|
||||||
|
i := !i+1;
|
||||||
|
failure.(!i) <- 0;
|
||||||
|
| _ ->
|
||||||
|
(* fallback for the prefix string *)
|
||||||
|
assert (!j > 0);
|
||||||
|
j := failure.(!j)
|
||||||
|
done;
|
||||||
|
{ failure; str; }
|
||||||
|
|
||||||
|
let compile s = compile_ ~dir:Direct s
|
||||||
|
let rcompile s = compile_ ~dir:Reverse s
|
||||||
|
|
||||||
|
(* proper search function.
|
||||||
|
[i] index in [s]
|
||||||
|
[j] index in [pattern]
|
||||||
|
[len] length of [s] *)
|
||||||
|
let find_
|
||||||
|
: type a. dir:a direction -> pattern:a pattern -> string -> int -> int
|
||||||
|
= fun ~dir ~pattern s idx ->
|
||||||
|
let len = length s in
|
||||||
|
let get = get_ ~dir in
|
||||||
|
let i = ref idx in
|
||||||
|
let j = ref 0 in
|
||||||
|
let pat_len = pattern_length pattern in
|
||||||
|
while !i < len && !j < pat_len do
|
||||||
|
let c = get s !i in
|
||||||
|
let expected = get pattern.str !j in
|
||||||
|
if c = expected
|
||||||
|
then (
|
||||||
|
(* char matches *)
|
||||||
|
i := !i + 1; j := !j + 1
|
||||||
|
) else (
|
||||||
|
if !j=0
|
||||||
|
then (* beginning of the pattern *)
|
||||||
|
i := !i + 1
|
||||||
|
else (* follow the failure link *)
|
||||||
|
j := pattern.failure.(!j)
|
||||||
|
)
|
||||||
done;
|
done;
|
||||||
-1
|
if !j = pat_len
|
||||||
with Exit ->
|
then !i - pat_len
|
||||||
!i
|
else -1
|
||||||
|
|
||||||
|
let find ~pattern s i = find_ ~dir:Direct ~pattern s i
|
||||||
|
|
||||||
|
let rfind ~pattern s i =
|
||||||
|
let i = String.length s - i - 1 in
|
||||||
|
let res = find_ ~dir:Reverse ~pattern s i in
|
||||||
|
(* adjust result: first, [res = string.length s - res -1] to convert
|
||||||
|
back to real indices; then, what we got is actually the position
|
||||||
|
of the end of the pattern, so we subtract the [length of the pattern -1]
|
||||||
|
to obtain the real result. *)
|
||||||
|
if res = ~-1
|
||||||
|
then res
|
||||||
|
else (String.length s - res) - pattern_length pattern
|
||||||
|
end
|
||||||
|
|
||||||
|
let find ?(start=0) ~sub s =
|
||||||
|
let pattern = KMP.compile sub in
|
||||||
|
KMP.find ~pattern s start
|
||||||
|
|
||||||
|
let find_all ?(start=0) ~sub s =
|
||||||
|
let pattern = KMP.compile sub in
|
||||||
|
let i = ref start in
|
||||||
|
fun () ->
|
||||||
|
let res = KMP.find ~pattern s !i in
|
||||||
|
if res = ~-1 then None
|
||||||
|
else (
|
||||||
|
i := res + KMP.pattern_length pattern;
|
||||||
|
Some res
|
||||||
|
)
|
||||||
|
|
||||||
|
let find_all_l ?start ~sub s =
|
||||||
|
let rec aux acc g = match g () with
|
||||||
|
| None -> List.rev acc
|
||||||
|
| Some i -> aux (i::acc) g
|
||||||
|
in
|
||||||
|
aux [] (find_all ?start ~sub s)
|
||||||
|
|
||||||
let mem ?start ~sub s = find ?start ~sub s >= 0
|
let mem ?start ~sub s = find ?start ~sub s >= 0
|
||||||
|
|
||||||
let rfind ~sub s =
|
let rfind ~sub s =
|
||||||
let n = String.length sub in
|
let pattern = KMP.rcompile sub in
|
||||||
let i = ref (String.length s - n) in
|
KMP.rfind ~pattern s (String.length s-1)
|
||||||
try
|
|
||||||
while !i >= 0 do
|
|
||||||
if _is_sub ~sub 0 s !i ~len:n then raise Exit;
|
|
||||||
decr i
|
|
||||||
done;
|
|
||||||
~-1
|
|
||||||
with Exit ->
|
|
||||||
!i
|
|
||||||
|
|
||||||
(* Replace substring [s.[pos]....s.[pos+len-1]] by [by] in [s] *)
|
(* Replace substring [s.[pos]....s.[pos+len-1]] by [by] in [s] *)
|
||||||
let replace_at_ ~pos ~len ~by s =
|
let replace_at_ ~pos ~len ~by s =
|
||||||
|
|
@ -109,16 +212,18 @@ let replace ?(which=`All) ~sub ~by s =
|
||||||
if sub="" then invalid_arg "CCString.replace";
|
if sub="" then invalid_arg "CCString.replace";
|
||||||
match which with
|
match which with
|
||||||
| `Left ->
|
| `Left ->
|
||||||
let i = find ~sub s in
|
let i = find ~sub s ~start:0 in
|
||||||
if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s
|
if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s
|
||||||
| `Right ->
|
| `Right ->
|
||||||
let i = rfind ~sub s in
|
let i = rfind ~sub s in
|
||||||
if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s
|
if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s
|
||||||
| `All ->
|
| `All ->
|
||||||
|
(* compile search pattern only once *)
|
||||||
|
let pattern = KMP.compile sub in
|
||||||
let b = Buffer.create (String.length s) in
|
let b = Buffer.create (String.length s) in
|
||||||
let start = ref 0 in
|
let start = ref 0 in
|
||||||
while !start < String.length s do
|
while !start < String.length s do
|
||||||
let i = find ~start:!start ~sub s in
|
let i = KMP.find ~pattern s !start in
|
||||||
if i>=0 then (
|
if i>=0 then (
|
||||||
(* between last and cur occurrences *)
|
(* between last and cur occurrences *)
|
||||||
Buffer.add_substring b s !start (i- !start);
|
Buffer.add_substring b s !start (i- !start);
|
||||||
|
|
|
||||||
|
|
@ -96,10 +96,32 @@ val find : ?start:int -> sub:string -> string -> int
|
||||||
Should only be used with very small [sub] *)
|
Should only be used with very small [sub] *)
|
||||||
|
|
||||||
(*$= & ~printer:string_of_int
|
(*$= & ~printer:string_of_int
|
||||||
(find ~sub:"bc" "abcd") 1
|
1 (find ~sub:"bc" "abcd")
|
||||||
(find ~sub:"bc" "abd") ~-1
|
~-1 (find ~sub:"bc" "abd")
|
||||||
(find ~sub:"a" "_a_a_a_") 1
|
1 (find ~sub:"a" "_a_a_a_")
|
||||||
(find ~sub:"a" ~start:5 "a1a234a") 6
|
6 (find ~sub:"a" ~start:5 "a1a234a")
|
||||||
|
*)
|
||||||
|
|
||||||
|
(*$Q & ~count:300
|
||||||
|
Q.(pair printable_string printable_string) (fun (s1,s2) -> \
|
||||||
|
let i = find ~sub:s2 s1 in \
|
||||||
|
i < 0 || String.sub s1 i (length s2) = s2)
|
||||||
|
*)
|
||||||
|
|
||||||
|
val find_all : ?start:int -> sub:string -> string -> int gen
|
||||||
|
(** [find_all ~sub s] finds all occurrences of [sub] in [s]
|
||||||
|
@param start starting position in [s]
|
||||||
|
@since NEXT_RELEASE *)
|
||||||
|
|
||||||
|
val find_all_l : ?start:int -> sub:string -> string -> int list
|
||||||
|
(** [find_all ~sub s] finds all occurrences of [sub] in [s] and returns
|
||||||
|
them in a list
|
||||||
|
@param start starting position in [s]
|
||||||
|
@since NEXT_RELEASE *)
|
||||||
|
|
||||||
|
(*$= & ~printer:Q.Print.(list int)
|
||||||
|
[1; 6] (find_all_l ~sub:"bc" "abc aabc aab")
|
||||||
|
[] (find_all_l ~sub:"bc" "abd")
|
||||||
*)
|
*)
|
||||||
|
|
||||||
val mem : ?start:int -> sub:string -> string -> bool
|
val mem : ?start:int -> sub:string -> string -> bool
|
||||||
|
|
@ -117,11 +139,17 @@ val rfind : sub:string -> string -> int
|
||||||
@since 0.12 *)
|
@since 0.12 *)
|
||||||
|
|
||||||
(*$= & ~printer:string_of_int
|
(*$= & ~printer:string_of_int
|
||||||
(rfind ~sub:"bc" "abcd") 1
|
1 (rfind ~sub:"bc" "abcd")
|
||||||
(rfind ~sub:"bc" "abd") ~-1
|
~-1 (rfind ~sub:"bc" "abd")
|
||||||
(rfind ~sub:"a" "_a_a_a_") 5
|
5 (rfind ~sub:"a" "_a_a_a_")
|
||||||
(rfind ~sub:"bc" "abcdbcd") 4
|
4 (rfind ~sub:"bc" "abcdbcd")
|
||||||
(rfind ~sub:"a" "a1a234a") 6
|
6 (rfind ~sub:"a" "a1a234a")
|
||||||
|
*)
|
||||||
|
|
||||||
|
(*$Q & ~count:300
|
||||||
|
Q.(pair printable_string printable_string) (fun (s1,s2) -> \
|
||||||
|
let i = rfind ~sub:s2 s1 in \
|
||||||
|
i < 0 || String.sub s1 i (length s2) = s2)
|
||||||
*)
|
*)
|
||||||
|
|
||||||
val replace : ?which:[`Left|`Right|`All] -> sub:string -> by:string -> string -> string
|
val replace : ?which:[`Left|`Right|`All] -> sub:string -> by:string -> string -> string
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue