mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 03:05:28 -05:00
add a special case for pattern of length 1 in CCString.find
This commit is contained in:
parent
ab0b198f97
commit
ce6d981973
2 changed files with 77 additions and 42 deletions
|
|
@ -1158,7 +1158,7 @@ module Str = struct
|
|||
(* random string, but always returns the same for a given size *)
|
||||
let rand_str_ ?(among="abcdefgh") n =
|
||||
let module Q = Quickcheck in
|
||||
let st = Random.State.make [| n |] in
|
||||
let st = Random.State.make [| n + 17 |] in
|
||||
let gen_c = Q.Gen.oneofl (CCString.to_list among) in
|
||||
Q.Gen.string_size ~gen:gen_c (Q.Gen.return n) st
|
||||
|
||||
|
|
@ -1213,11 +1213,15 @@ module Str = struct
|
|||
and mk_current = match dir with
|
||||
| `Direct -> fun () -> CCString.find ~sub:needle haystack
|
||||
| `Reverse -> fun () -> CCString.rfind ~sub:needle haystack
|
||||
and mk_current_compiled = match dir with
|
||||
| `Direct -> let f = CCString.find ~start:0 ~sub:needle in fun () -> f haystack
|
||||
| `Reverse -> let f = CCString.rfind ~sub:needle in fun () -> f haystack
|
||||
in
|
||||
assert (mk_naive () = mk_current ());
|
||||
B.throughputN 3 ~repeat
|
||||
[ "naive", mk_naive, ()
|
||||
; "current", mk_current, ()
|
||||
; "current_compiled", mk_current_compiled, ()
|
||||
]
|
||||
|
||||
(* benchmark String.find_all *)
|
||||
|
|
@ -1226,11 +1230,14 @@ module Str = struct
|
|||
let haystack = rand_str_ n in
|
||||
pp_pb needle haystack;
|
||||
let mk_naive () = find_all_l ~sub:needle haystack
|
||||
and mk_current () = CCString.find_all_l ~sub:needle haystack in
|
||||
and mk_current () = CCString.find_all_l ~sub:needle haystack
|
||||
and mk_current_compiled =
|
||||
let f = CCString.find_all_l ~start:0 ~sub:needle in fun () -> f haystack in
|
||||
assert (mk_naive () = mk_current ());
|
||||
B.throughputN 3 ~repeat
|
||||
[ "naive", mk_naive, ()
|
||||
; "current", mk_current, ()
|
||||
; "current_compiled", mk_current_compiled, ()
|
||||
]
|
||||
|
||||
(* benchmark String.find_all on constant strings *)
|
||||
|
|
@ -1252,8 +1259,7 @@ module Str = struct
|
|||
let () = B.Tree.register (
|
||||
"string" @>>>
|
||||
[ "find" @>>>
|
||||
[ "1" @>> app_ints (bench_find ~size:1) [100; 100_000; 500_000]
|
||||
; "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000]
|
||||
[ "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000]
|
||||
; "5" @>> app_ints (bench_find ~size:5) [100; 100_000; 500_000]
|
||||
; "15" @>> app_ints (bench_find ~size:15) [100; 100_000; 500_000]
|
||||
; "50" @>> app_ints (bench_find ~size:50) [100; 100_000; 500_000]
|
||||
|
|
|
|||
|
|
@ -75,15 +75,15 @@ type _ direction =
|
|||
| Reverse : [`Reverse] direction
|
||||
|
||||
(* we follow https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm *)
|
||||
module KMP = struct
|
||||
type 'a pattern = {
|
||||
module Find = struct
|
||||
type 'a kmp_pattern = {
|
||||
failure : int array;
|
||||
str : string;
|
||||
}
|
||||
(* invariant: [length failure = length str].
|
||||
We use a phantom type to avoid mixing the directions. *)
|
||||
|
||||
let pattern_length p = String.length p.str
|
||||
let kmp_pattern_length p = String.length p.str
|
||||
|
||||
(* access the [i]-th element of [s] according to direction [dir] *)
|
||||
let get_
|
||||
|
|
@ -92,8 +92,8 @@ module KMP = struct
|
|||
| Direct -> String.get
|
||||
| Reverse -> (fun s i -> s.[String.length s - i - 1])
|
||||
|
||||
let compile_
|
||||
: type a. dir:a direction -> string -> a pattern
|
||||
let kmp_compile_
|
||||
: type a. dir:a direction -> string -> a kmp_pattern
|
||||
= fun ~dir str ->
|
||||
let len = length str in
|
||||
let get = get_ ~dir in (* how to read elements of the string *)
|
||||
|
|
@ -127,21 +127,21 @@ module KMP = struct
|
|||
(* Format.printf "{@[failure:%a, str:%s@]}@." CCFormat.(array int) failure str; *)
|
||||
{ failure; str; }
|
||||
|
||||
let compile s = compile_ ~dir:Direct s
|
||||
let rcompile s = compile_ ~dir:Reverse s
|
||||
let kmp_compile s = kmp_compile_ ~dir:Direct s
|
||||
let kmp_rcompile s = kmp_compile_ ~dir:Reverse s
|
||||
|
||||
(* proper search function.
|
||||
[i] index in [s]
|
||||
[j] index in [pattern]
|
||||
[len] length of [s] *)
|
||||
let find_
|
||||
: type a. dir:a direction -> pattern:a pattern -> string -> int -> int
|
||||
let kmp_find_
|
||||
: type a. dir:a direction -> pattern:a kmp_pattern -> string -> int -> int
|
||||
= fun ~dir ~pattern s idx ->
|
||||
let len = length s in
|
||||
let get = get_ ~dir in
|
||||
let i = ref idx in
|
||||
let j = ref 0 in
|
||||
let pat_len = pattern_length pattern in
|
||||
let pat_len = kmp_pattern_length pattern in
|
||||
while !j < pat_len && !i + !j < len do
|
||||
let c = get s (!i + !j) in
|
||||
let expected = get pattern.str !j in
|
||||
|
|
@ -168,34 +168,63 @@ module KMP = struct
|
|||
then !i
|
||||
else -1
|
||||
|
||||
let find ~pattern s i = find_ ~dir:Direct ~pattern s i
|
||||
let kmp_find ~pattern s i = kmp_find_ ~dir:Direct ~pattern s i
|
||||
|
||||
let rfind ~pattern s i =
|
||||
let kmp_rfind ~pattern s i =
|
||||
let i = String.length s - i - 1 in
|
||||
let res = find_ ~dir:Reverse ~pattern s i in
|
||||
let res = kmp_find_ ~dir:Reverse ~pattern s i in
|
||||
(* adjust result: first, [res = string.length s - res -1] to convert
|
||||
back to real indices; then, what we got is actually the position
|
||||
of the end of the pattern, so we subtract the [length of the pattern -1]
|
||||
to obtain the real result. *)
|
||||
if res = ~-1
|
||||
then res
|
||||
else (String.length s - res) - pattern_length pattern
|
||||
else (String.length s - res) - kmp_pattern_length pattern
|
||||
|
||||
type 'a pattern =
|
||||
| P_char of char
|
||||
| P_KMP of 'a kmp_pattern
|
||||
|
||||
let pattern_length = function
|
||||
| P_char _ -> 1
|
||||
| P_KMP p -> kmp_pattern_length p
|
||||
|
||||
let compile ~sub : [`Direct] pattern =
|
||||
if length sub=1
|
||||
then P_char sub.[0]
|
||||
else P_KMP (kmp_compile sub)
|
||||
|
||||
let rcompile ~sub : [`Reverse] pattern =
|
||||
if length sub=1
|
||||
then P_char sub.[0]
|
||||
else P_KMP (kmp_rcompile sub)
|
||||
|
||||
let find ~pattern s start = match pattern with
|
||||
| P_char c ->
|
||||
(try String.index_from s start c with Not_found -> -1)
|
||||
| P_KMP pattern -> kmp_find ~pattern s start
|
||||
|
||||
let rfind ~pattern s start = match pattern with
|
||||
| P_char c ->
|
||||
(try String.rindex_from s start c with Not_found -> -1)
|
||||
| P_KMP pattern -> kmp_rfind ~pattern s start
|
||||
end
|
||||
|
||||
let find ?(start=0) ~sub s =
|
||||
let pattern = KMP.compile sub in
|
||||
KMP.find ~pattern s start
|
||||
let find ?(start=0) ~sub =
|
||||
let pattern = Find.compile ~sub in
|
||||
fun s -> Find.find ~pattern s start
|
||||
|
||||
let find_all ?(start=0) ~sub s =
|
||||
let pattern = KMP.compile sub in
|
||||
let i = ref start in
|
||||
fun () ->
|
||||
let res = KMP.find ~pattern s !i in
|
||||
if res = ~-1 then None
|
||||
else (
|
||||
i := res + KMP.pattern_length pattern;
|
||||
Some res
|
||||
)
|
||||
let find_all ?(start=0) ~sub =
|
||||
let pattern = Find.compile ~sub in
|
||||
fun s ->
|
||||
let i = ref start in
|
||||
fun () ->
|
||||
let res = Find.find ~pattern s !i in
|
||||
if res = ~-1 then None
|
||||
else (
|
||||
i := res + Find.pattern_length pattern;
|
||||
Some res
|
||||
)
|
||||
|
||||
let find_all_l ?start ~sub s =
|
||||
let rec aux acc g = match g () with
|
||||
|
|
@ -206,9 +235,9 @@ let find_all_l ?start ~sub s =
|
|||
|
||||
let mem ?start ~sub s = find ?start ~sub s >= 0
|
||||
|
||||
let rfind ~sub s =
|
||||
let pattern = KMP.rcompile sub in
|
||||
KMP.rfind ~pattern s (String.length s-1)
|
||||
let rfind ~sub =
|
||||
let pattern = Find.rcompile ~sub in
|
||||
fun s -> Find.rfind ~pattern s (String.length s-1)
|
||||
|
||||
(* Replace substring [s.[pos]....s.[pos+len-1]] by [by] in [s] *)
|
||||
let replace_at_ ~pos ~len ~by s =
|
||||
|
|
@ -229,11 +258,11 @@ let replace ?(which=`All) ~sub ~by s =
|
|||
if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s
|
||||
| `All ->
|
||||
(* compile search pattern only once *)
|
||||
let pattern = KMP.compile sub in
|
||||
let pattern = Find.compile ~sub in
|
||||
let b = Buffer.create (String.length s) in
|
||||
let start = ref 0 in
|
||||
while !start < String.length s do
|
||||
let i = KMP.find ~pattern s !start in
|
||||
let i = Find.find ~pattern s !start in
|
||||
if i>=0 then (
|
||||
(* between last and cur occurrences *)
|
||||
Buffer.add_substring b s !start (i- !start);
|
||||
|
|
@ -256,16 +285,16 @@ module Split = struct
|
|||
| SplitStop -> None
|
||||
| SplitAt prev -> _split_search ~by s prev
|
||||
and _split_search ~by s prev =
|
||||
let j = KMP.find ~pattern:by s prev in
|
||||
let j = Find.find ~pattern:by s prev in
|
||||
if j < 0
|
||||
then Some (SplitStop, prev, String.length s - prev)
|
||||
else Some (SplitAt (j+KMP.pattern_length by), prev, j-prev)
|
||||
else Some (SplitAt (j+Find.pattern_length by), prev, j-prev)
|
||||
|
||||
let _tuple3 x y z = x,y,z
|
||||
|
||||
let _mkgen ~by s k =
|
||||
let state = ref (SplitAt 0) in
|
||||
let by = KMP.compile by in
|
||||
let by = Find.compile ~sub:by in
|
||||
fun () ->
|
||||
match _split ~by s !state with
|
||||
| None -> None
|
||||
|
|
@ -278,7 +307,7 @@ module Split = struct
|
|||
let gen_cpy ~by s = _mkgen ~by s String.sub
|
||||
|
||||
let _mklist ~by s k =
|
||||
let by = KMP.compile by in
|
||||
let by = Find.compile ~sub:by in
|
||||
let rec build acc state = match _split ~by s state with
|
||||
| None -> List.rev acc
|
||||
| Some (state', i, len) ->
|
||||
|
|
@ -291,7 +320,7 @@ module Split = struct
|
|||
let list_cpy ~by s = _mklist ~by s String.sub
|
||||
|
||||
let _mkklist ~by s k =
|
||||
let by = KMP.compile by in
|
||||
let by = Find.compile ~sub:by in
|
||||
let rec make state () = match _split ~by s state with
|
||||
| None -> `Nil
|
||||
| Some (state', i, len) ->
|
||||
|
|
@ -303,7 +332,7 @@ module Split = struct
|
|||
let klist_cpy ~by s = _mkklist ~by s String.sub
|
||||
|
||||
let _mkseq ~by s f k =
|
||||
let by = KMP.compile by in
|
||||
let by = Find.compile ~sub:by in
|
||||
let rec aux state = match _split ~by s state with
|
||||
| None -> ()
|
||||
| Some (state', i, len) -> k (f s i len); aux state'
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue