mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 11:15:31 -05:00
more robust crawler
This commit is contained in:
parent
a5d9a39afd
commit
c582e18cfa
1 changed file with 4 additions and 3 deletions
|
|
@@ -8,7 +8,7 @@ let pool = Future.Pool.create ~timeout:15. ~size:15
|
||||||
let split_lines s = String.nsplit s ~by:"\n"
|
let split_lines s = String.nsplit s ~by:"\n"
|
||||||
|
|
||||||
let get_and_parse url =
|
let get_and_parse url =
|
||||||
let cmd = Format.sprintf "wget -q '%s' -O - | grep -o 'http://[^ \"]*.html'" url in
|
let cmd = Format.sprintf "wget -q '%s' -O - | grep -o 'http\\(s\\)\\?://[^ \"]\\+'" url in
|
||||||
let content = Future.spawn_process ?stdin:None ~pool ~cmd in
|
let content = Future.spawn_process ?stdin:None ~pool ~cmd in
|
||||||
content
|
content
|
||||||
|> Future.map (fun (_, stdout, _) -> stdout)
|
|> Future.map (fun (_, stdout, _) -> stdout)
|
||||||
|
|
@@ -22,8 +22,9 @@ type page = string * (string list Future.t)
|
||||||
let g : (page, unit, unit) LazyGraph.t =
|
let g : (page, unit, unit) LazyGraph.t =
|
||||||
let force (url, future) =
|
let force (url, future) =
|
||||||
Format.printf "force %s@." url;
|
Format.printf "force %s@." url;
|
||||||
let urls = Future.get future
|
let urls =
|
||||||
|> List.map (fun url -> (), (url, get_and_parse url)) in
|
try Future.get future |> List.map (fun url -> (), (url, get_and_parse url))
|
||||||
|
with e -> [] in
|
||||||
let edges = Gen.of_list urls in
|
let edges = Gen.of_list urls in
|
||||||
(* need to parse the page to get the urls *)
|
(* need to parse the page to get the urls *)
|
||||||
LazyGraph.Node ((url, future), (), edges)
|
LazyGraph.Node ((url, future), (), edges)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue