Mirror of https://github.com/c-cube/ezcurl.git (synced 2025-12-05 19:00:34 -05:00)
feat: add an example, a simple web crawler that looks for deadlinks
parent: 54e7d1740f
commit: 863a6263bb
3 changed files with 169 additions and 0 deletions
argiope.sh (new executable file, 3 additions)

@@ -0,0 +1,3 @@
#!/bin/sh

exec dune exec examples/argiope/argiope.exe --profile=release -- "$@"
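
A usage sketch (the URL is a placeholder); all arguments are forwarded
verbatim to argiope.exe:

    ./argiope.sh https://example.com -j 10 --max 200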

examples/argiope/argiope.ml (new file, 160 additions)

@@ -0,0 +1,160 @@
open Lwt.Infix

module Str_set = CCSet.Make(String)

module Uri_tbl = CCHashtbl.Make(struct
  include Uri
  let hash u = Hashtbl.hash (to_string u)
end)

let verbose_ = ref 0

module Run = struct
  type t = {
    domains: Str_set.t; (* domains to recursively crawl *)
    tasks: Uri.t Queue.t;
    default_host: string;
    max: int;
    seen: unit Uri_tbl.t; (* already explored *)
    mutable bad: Uri.t list;
    mutable n: int; (* number of pages crawled *)
    j: int;
  }
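  (* a URI is marked seen as soon as it is enqueued, so each
     canonicalized URI is fetched at most once *)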
  let push_task (self:t) u : unit =
    let u = Uri.canonicalize u in
    if not @@ Uri_tbl.mem self.seen u then (
      Uri_tbl.add self.seen u ();
      Queue.push u self.tasks
    )

  let make ~j ~domains ~default_host ~max start : t =
    let tasks = Queue.create () in
    (* include the domains of [start] in [domains] *)
    let domains =
      List.fold_left
        (fun set uri -> match Uri.host uri with
           | None -> set
           | Some h -> Str_set.add h set)
        domains start
    in
    let self = {
      domains; j; max; tasks; default_host; seen=Uri_tbl.create 256;
      bad=[]; n=0;
    } in
    List.iter (fun uri -> push_task self uri) start;
    self

  let bad_code c = c >= 400
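  (* lambdasoup: [body $$ "a[href]"] selects every anchor element carrying
     an [href] attribute; [Soup.R.attribute] raises if the attribute is
     missing, hence the [try] *)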
  let find_urls (body:string) : Uri.t list =
    let body = Soup.parse body in
    let open Soup.Infix in
    let nodes = body $$ "a[href]" in
    Soup.fold
      (fun l n ->
         try
           let url' = Soup.R.attribute "href" n in
           Uri.of_string url' :: l
         with _ -> l)
      [] nodes
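  (* each worker owns one curl client and pops from the shared queue.
     Lwt is cooperative, so the queue needs no locking: a worker only
     yields at [>>=] points *)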
  let worker (self:t) : unit Lwt.t =
    let client = Ezcurl_lwt.make () in
    let rec loop () =
      if Queue.is_empty self.tasks then Lwt.return ()
      else if self.max >= 0 && self.n > self.max then Lwt.return ()
      else (
        let uri = Queue.pop self.tasks in
        if !verbose_ > 0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri);
        (* fetch the URL (only the first 500kB) *)
        self.n <- 1 + self.n;
        Ezcurl_lwt.get ~client ~range:"0-500000" ~url:(Uri.to_string uri) ()
        >>= fun resp ->
        begin match resp with
          | Ok {Ezcurl_lwt.code; body; _} ->
            if bad_code code then (
              if !verbose_ > 1 then (
                Printf.eprintf "bad code when fetching %s: %d\n%!"
                  (Uri.to_string uri) code;
              );
              self.bad <- uri :: self.bad; (* bad URL! *)
            ) else (
              (* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *)
              let cur_host = Uri.host_with_default ~default:self.default_host uri in
              let uris = find_urls body in
              List.iter
                (fun uri' ->
                   match Uri.host uri' with
                   | Some h when Str_set.mem h self.domains ->
                     (* follow this link *)
                     if !verbose_ > 1 then
                       Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
                     push_task self uri'
                   | Some _ -> ()
                   | None ->
                     (* relative URL, make it absolute *)
                     let uri' = Uri.with_host uri' (Some cur_host) in
                     let uri' = Uri.with_scheme uri' (Uri.scheme uri) in
                     if !verbose_ > 1 then
                       Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
                     push_task self uri')
                uris;
            );
            Lwt.return ()
          | Error (_, msg) ->
            if !verbose_ > 2 then (
              Printf.eprintf "error when fetching %s:\n %s\n%!" (Uri.to_string uri) msg;
            );
            self.bad <- uri :: self.bad; (* bad URL! *)
            Lwt.return ()
        end
        >>= loop (* recurse *)
      )
    in
    loop ()
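  (* returns the dead links, the number of pages crawled, and how many
     queued tasks were left unprocessed (nonzero when [max] cut the
     crawl short) *)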
  let run (self:t) : _ Lwt.t =
    Printf.printf "run %d jobs…\ndomain(s): [%s]\n%!" self.j
      (String.concat "," @@ Str_set.elements self.domains);
    let workers = CCList.init self.j (fun _ -> worker self) in
    (* wait for all workers to be done *)
    Lwt.join workers >|= fun () ->
    self.bad, self.n, Queue.length self.tasks
end

let () =
  let domains = ref Str_set.empty in
  let start = ref [] in
  let j = ref 20 in
  let max_ = ref ~-1 in
  let opts = [
    "-v", Arg.Unit (fun _ -> incr verbose_), " verbose";
    "--domain", Arg.String (fun s -> domains := Str_set.add s !domains), " include given domain";
    "-d", Arg.String (fun s -> domains := Str_set.add s !domains), " alias to --domain";
    "--max", Arg.Set_int max_, " max number of pages to explore";
    "-j", Arg.Set_int j, " number of jobs (default 20)";
  ] |> Arg.align in
  Arg.parse opts (CCList.Ref.push start) "usage: crawler url [url*] [option*]";
  if !start = [] then (
    failwith "need at least one initial URL"
  ) else (
    let start = List.map Uri.of_string !start in
    let default_host = match Uri.host @@ List.hd start with
      | Some h -> h
      | _ -> failwith "need absolute URIs"
      | exception _ -> failwith "need absolute URIs"
    in
    let run =
      Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start
    in
    (* crawl *)
    let bad, num, remaining = Lwt_main.run (Run.run run) in
    if bad <> [] then (
      Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n"
        num (List.length bad) remaining;
      List.iter (fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri)) bad;
      exit 1
    ) else (
      Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining
    )
  )
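
For context, the crawler's HTTP layer boils down to one ezcurl-lwt call per
page. Below is a minimal sketch of fetching a single URL, using only the
calls that appear in the code above; [fetch_once] and the URL are
hypothetical:

    open Lwt.Infix

    (* fetch one page and print the status code and body size *)
    let fetch_once url : unit Lwt.t =
      let client = Ezcurl_lwt.make () in
      Ezcurl_lwt.get ~client ~url () >|= function
      | Ok {Ezcurl_lwt.code; body; _} ->
        Printf.printf "HTTP %d, %d bytes\n" code (String.length body)
      | Error (_, msg) ->
        Printf.printf "curl error: %s\n" msg

    let () = Lwt_main.run (fetch_once "https://example.com")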

examples/argiope/dune (new file, 6 additions)

@@ -0,0 +1,6 @@

(executable
 (name argiope)
 (synopsis "A web crawler that can typically be found in Texas")
 (modes native)
 (libraries uri lambdasoup lwt ezcurl-lwt containers))
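
To build and run from a clone of the repository (assuming dune and the
listed libraries are installed, e.g. via opam):

    dune build examples/argiope/argiope.exe
    ./argiope.sh https://example.com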