add a special case for pattern of length 1 in CCString.find

This commit is contained in:
Simon Cruanes 2016-03-09 23:34:38 +01:00
parent ab0b198f97
commit ce6d981973
2 changed files with 77 additions and 42 deletions

View file

@ -1158,7 +1158,7 @@ module Str = struct
(* random string, but always returns the same for a given size *)
let rand_str_ ?(among="abcdefgh") n =
let module Q = Quickcheck in
let st = Random.State.make [| n |] in
let st = Random.State.make [| n + 17 |] in
let gen_c = Q.Gen.oneofl (CCString.to_list among) in
Q.Gen.string_size ~gen:gen_c (Q.Gen.return n) st
@ -1213,11 +1213,15 @@ module Str = struct
and mk_current = match dir with
| `Direct -> fun () -> CCString.find ~sub:needle haystack
| `Reverse -> fun () -> CCString.rfind ~sub:needle haystack
and mk_current_compiled = match dir with
| `Direct -> let f = CCString.find ~start:0 ~sub:needle in fun () -> f haystack
| `Reverse -> let f = CCString.rfind ~sub:needle in fun () -> f haystack
in
assert (mk_naive () = mk_current ());
B.throughputN 3 ~repeat
[ "naive", mk_naive, ()
; "current", mk_current, ()
; "current_compiled", mk_current_compiled, ()
]
(* benchmark String.find_all *)
@ -1226,11 +1230,14 @@ module Str = struct
let haystack = rand_str_ n in
pp_pb needle haystack;
let mk_naive () = find_all_l ~sub:needle haystack
and mk_current () = CCString.find_all_l ~sub:needle haystack in
and mk_current () = CCString.find_all_l ~sub:needle haystack
and mk_current_compiled =
let f = CCString.find_all_l ~start:0 ~sub:needle in fun () -> f haystack in
assert (mk_naive () = mk_current ());
B.throughputN 3 ~repeat
[ "naive", mk_naive, ()
; "current", mk_current, ()
; "current_compiled", mk_current_compiled, ()
]
(* benchmark String.find_all on constant strings *)
@ -1252,8 +1259,7 @@ module Str = struct
let () = B.Tree.register (
"string" @>>>
[ "find" @>>>
[ "1" @>> app_ints (bench_find ~size:1) [100; 100_000; 500_000]
; "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000]
[ "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000]
; "5" @>> app_ints (bench_find ~size:5) [100; 100_000; 500_000]
; "15" @>> app_ints (bench_find ~size:15) [100; 100_000; 500_000]
; "50" @>> app_ints (bench_find ~size:50) [100; 100_000; 500_000]

View file

@ -75,15 +75,15 @@ type _ direction =
| Reverse : [`Reverse] direction
(* we follow https://en.wikipedia.org/wiki/KnuthMorrisPratt_algorithm *)
module KMP = struct
type 'a pattern = {
module Find = struct
type 'a kmp_pattern = {
failure : int array;
str : string;
}
(* invariant: [length failure = length str].
We use a phantom type to avoid mixing the directions. *)
let pattern_length p = String.length p.str
let kmp_pattern_length p = String.length p.str
(* access the [i]-th element of [s] according to direction [dir] *)
let get_
@ -92,8 +92,8 @@ module KMP = struct
| Direct -> String.get
| Reverse -> (fun s i -> s.[String.length s - i - 1])
let compile_
: type a. dir:a direction -> string -> a pattern
let kmp_compile_
: type a. dir:a direction -> string -> a kmp_pattern
= fun ~dir str ->
let len = length str in
let get = get_ ~dir in (* how to read elements of the string *)
@ -127,21 +127,21 @@ module KMP = struct
(* Format.printf "{@[failure:%a, str:%s@]}@." CCFormat.(array int) failure str; *)
{ failure; str; }
let compile s = compile_ ~dir:Direct s
let rcompile s = compile_ ~dir:Reverse s
let kmp_compile s = kmp_compile_ ~dir:Direct s
let kmp_rcompile s = kmp_compile_ ~dir:Reverse s
(* proper search function.
[i] index in [s]
[j] index in [pattern]
[len] length of [s] *)
let find_
: type a. dir:a direction -> pattern:a pattern -> string -> int -> int
let kmp_find_
: type a. dir:a direction -> pattern:a kmp_pattern -> string -> int -> int
= fun ~dir ~pattern s idx ->
let len = length s in
let get = get_ ~dir in
let i = ref idx in
let j = ref 0 in
let pat_len = pattern_length pattern in
let pat_len = kmp_pattern_length pattern in
while !j < pat_len && !i + !j < len do
let c = get s (!i + !j) in
let expected = get pattern.str !j in
@ -168,34 +168,63 @@ module KMP = struct
then !i
else -1
let find ~pattern s i = find_ ~dir:Direct ~pattern s i
let kmp_find ~pattern s i = kmp_find_ ~dir:Direct ~pattern s i
let rfind ~pattern s i =
let kmp_rfind ~pattern s i =
let i = String.length s - i - 1 in
let res = find_ ~dir:Reverse ~pattern s i in
let res = kmp_find_ ~dir:Reverse ~pattern s i in
(* adjust result: first, [res = string.length s - res -1] to convert
back to real indices; then, what we got is actually the position
of the end of the pattern, so we subtract the [length of the pattern -1]
to obtain the real result. *)
if res = ~-1
then res
else (String.length s - res) - pattern_length pattern
else (String.length s - res) - kmp_pattern_length pattern
type 'a pattern =
| P_char of char
| P_KMP of 'a kmp_pattern
let pattern_length = function
| P_char _ -> 1
| P_KMP p -> kmp_pattern_length p
let compile ~sub : [`Direct] pattern =
if length sub=1
then P_char sub.[0]
else P_KMP (kmp_compile sub)
let rcompile ~sub : [`Reverse] pattern =
if length sub=1
then P_char sub.[0]
else P_KMP (kmp_rcompile sub)
let find ~pattern s start = match pattern with
| P_char c ->
(try String.index_from s start c with Not_found -> -1)
| P_KMP pattern -> kmp_find ~pattern s start
let rfind ~pattern s start = match pattern with
| P_char c ->
(try String.rindex_from s start c with Not_found -> -1)
| P_KMP pattern -> kmp_rfind ~pattern s start
end
let find ?(start=0) ~sub s =
let pattern = KMP.compile sub in
KMP.find ~pattern s start
let find ?(start=0) ~sub =
let pattern = Find.compile ~sub in
fun s -> Find.find ~pattern s start
let find_all ?(start=0) ~sub s =
let pattern = KMP.compile sub in
let i = ref start in
fun () ->
let res = KMP.find ~pattern s !i in
if res = ~-1 then None
else (
i := res + KMP.pattern_length pattern;
Some res
)
let find_all ?(start=0) ~sub =
let pattern = Find.compile ~sub in
fun s ->
let i = ref start in
fun () ->
let res = Find.find ~pattern s !i in
if res = ~-1 then None
else (
i := res + Find.pattern_length pattern;
Some res
)
let find_all_l ?start ~sub s =
let rec aux acc g = match g () with
@ -206,9 +235,9 @@ let find_all_l ?start ~sub s =
let mem ?start ~sub s = find ?start ~sub s >= 0
let rfind ~sub s =
let pattern = KMP.rcompile sub in
KMP.rfind ~pattern s (String.length s-1)
let rfind ~sub =
let pattern = Find.rcompile ~sub in
fun s -> Find.rfind ~pattern s (String.length s-1)
(* Replace substring [s.[pos]....s.[pos+len-1]] by [by] in [s] *)
let replace_at_ ~pos ~len ~by s =
@ -229,11 +258,11 @@ let replace ?(which=`All) ~sub ~by s =
if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s
| `All ->
(* compile search pattern only once *)
let pattern = KMP.compile sub in
let pattern = Find.compile ~sub in
let b = Buffer.create (String.length s) in
let start = ref 0 in
while !start < String.length s do
let i = KMP.find ~pattern s !start in
let i = Find.find ~pattern s !start in
if i>=0 then (
(* between last and cur occurrences *)
Buffer.add_substring b s !start (i- !start);
@ -256,16 +285,16 @@ module Split = struct
| SplitStop -> None
| SplitAt prev -> _split_search ~by s prev
and _split_search ~by s prev =
let j = KMP.find ~pattern:by s prev in
let j = Find.find ~pattern:by s prev in
if j < 0
then Some (SplitStop, prev, String.length s - prev)
else Some (SplitAt (j+KMP.pattern_length by), prev, j-prev)
else Some (SplitAt (j+Find.pattern_length by), prev, j-prev)
let _tuple3 x y z = x,y,z
let _mkgen ~by s k =
let state = ref (SplitAt 0) in
let by = KMP.compile by in
let by = Find.compile ~sub:by in
fun () ->
match _split ~by s !state with
| None -> None
@ -278,7 +307,7 @@ module Split = struct
let gen_cpy ~by s = _mkgen ~by s String.sub
let _mklist ~by s k =
let by = KMP.compile by in
let by = Find.compile ~sub:by in
let rec build acc state = match _split ~by s state with
| None -> List.rev acc
| Some (state', i, len) ->
@ -291,7 +320,7 @@ module Split = struct
let list_cpy ~by s = _mklist ~by s String.sub
let _mkklist ~by s k =
let by = KMP.compile by in
let by = Find.compile ~sub:by in
let rec make state () = match _split ~by s state with
| None -> `Nil
| Some (state', i, len) ->
@ -303,7 +332,7 @@ module Split = struct
let klist_cpy ~by s = _mkklist ~by s String.sub
let _mkseq ~by s f k =
let by = KMP.compile by in
let by = Find.compile ~sub:by in
let rec aux state = match _split ~by s state with
| None -> ()
| Some (state', i, len) -> k (f s i len); aux state'