ezcurl/examples/argiope/argiope.ml

open Lwt.Infix
module Str_set = CCSet.Make (String)
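(* Hashtable keyed by URIs: [include Uri] supplies [equal], and hashing goes
   through the string form since [Uri.t] is abstract. *)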
module Uri_tbl = CCHashtbl.Make (struct
  include Uri

  let hash u = Hashtbl.hash (to_string u)
end)
let verbose_ = ref 0
module Run = struct
  type t = {
    domains: Str_set.t; (* domains to recursively crawl *)
    tasks: Uri.t Queue.t;
    default_host: string;
    max: int;
    seen: unit Uri_tbl.t; (* already explored *)
    mutable bad: Uri.t list;
    mutable n: int; (* number of pages crawled *)
    j: int;
  }
  let push_task (self : t) u : unit =
    let u = Uri.canonicalize u in
    if not @@ Uri_tbl.mem self.seen u then (
      Uri_tbl.add self.seen u ();
      Queue.push u self.tasks
    )
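  (* Note: the shared [tasks] queue and [seen] table need no locking: Lwt is
     cooperative, so nothing can preempt between the [mem] check and the
     [add]. *)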
  let make ~j ~domains ~default_host ~max start : t =
    let tasks = Queue.create () in
    (* include the domains of [start] in [domains] *)
    let domains =
      List.fold_left
        (fun set uri ->
          match Uri.host uri with
          | None -> set
          | Some h -> Str_set.add h set)
        domains start
    in
    let self =
      {
        domains;
        j;
        max;
        tasks;
        default_host;
        seen = Uri_tbl.create 256;
        bad = [];
        n = 0;
      }
    in
    List.iter (fun uri -> push_task self uri) start;
    self
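  (* Illustrative: with [~domains] containing "ocaml.org" and a start URI of
     https://example.com/, the crawl follows links into both example.com and
     ocaml.org, since the hosts of the start URIs are merged into [domains]
     above. *)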
  let bad_code c = c >= 400

  let find_urls (body : string) : Uri.t list =
    let body = Soup.parse body in
    let open Soup.Infix in
    let nodes = body $$ "a[href]" in
    Soup.fold
      (fun l n ->
        try
          let url' = Soup.R.attribute "href" n in
          Uri.of_string url' :: l
        with _ -> l)
      [] nodes
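  (* Illustrative sketch: on a body like {|<a href="/about">About</a>|},
     [find_urls] returns [Uri.of_string "/about"], a host-less URI that the
     worker below makes absolute before enqueueing it. *)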
  let worker (self : t) : unit Lwt.t =
    let client = Ezcurl_lwt.make () in
    let rec loop () =
      if Queue.is_empty self.tasks then
        Lwt.return ()
      else if self.max >= 0 && self.n > self.max then
        Lwt.return ()
      else (
        let uri = Queue.pop self.tasks in
        if !verbose_ > 0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri);
        (* fetch URL (only 500kb) *)
        self.n <- 1 + self.n;
        Ezcurl_lwt.get ~client ~range:"0-500000" ~url:(Uri.to_string uri) ()
        >>= fun resp ->
        (match resp with
        | Ok { Ezcurl_lwt.code; body; _ } ->
          if bad_code code then (
            if !verbose_ > 1 then
              Printf.eprintf "bad code when fetching %s: %d\n%!"
                (Uri.to_string uri) code;
            self.bad <- uri :: self.bad (* bad URL! *)
          ) else (
            (* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *)
            let cur_host =
              Uri.host_with_default ~default:self.default_host uri
            in
            let uris = find_urls body in
            List.iter
              (fun uri' ->
                match Uri.host uri' with
                | Some h when Str_set.mem h self.domains ->
                  (* follow this link *)
                  if !verbose_ > 1 then
                    Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
                  push_task self uri'
                | Some _ -> ()
                | None ->
                  (* relative URL, make it absolute *)
                  let uri' = Uri.with_host uri' (Some cur_host) in
                  let uri' = Uri.with_scheme uri' (Uri.scheme uri) in
                  if !verbose_ > 1 then
                    Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
                  push_task self uri')
              uris
          );
          Lwt.return ()
        | Error (_, msg) ->
          if !verbose_ > 2 then
            Printf.eprintf "error when fetching %s:\n %s\n%!"
              (Uri.to_string uri) msg;
          self.bad <- uri :: self.bad;
          (* bad URL! *)
          Lwt.return ())
        >>= loop (* recurse *)
      )
    in
    loop ()
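  (* Note on [worker]: a worker exits as soon as it observes an empty queue,
     even if siblings are still fetching and may enqueue more links, so
     parallelism can decay over a run; the queue is still drained as long as
     at least one worker remains. *)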
  let run (self : t) : _ Lwt.t =
    Printf.printf "run %d jobs…\ndomain(s): [%s]\n%!" self.j
      (String.concat "," @@ Str_set.elements self.domains);
    let workers = CCList.init self.j (fun _ -> worker self) in
    (* wait for all workers to be done *)
    Lwt.join workers >|= fun () -> self.bad, self.n, Queue.length self.tasks
end
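(* A minimal dune stanza for building this example might look as follows
   (an assumption, not taken from this repository's build files):
   (executable
    (name argiope)
    (libraries ezcurl-lwt lwt uri containers lambdasoup)) *)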
let help_str =
  {|A web crawler that can typically be found in Texas.
usage: argiope url [url*] [option*]
|}
let () =
  let domains = ref Str_set.empty in
  let start = ref [] in
  let j = ref 20 in
  let max_ = ref ~-1 in
  let opts =
    [
      "-v", Arg.Unit (fun _ -> incr verbose_), " verbose";
      ( "--domain",
        Arg.String (fun s -> domains := Str_set.add s !domains),
        " include given domain" );
      ( "-d",
        Arg.String (fun s -> domains := Str_set.add s !domains),
        " alias to --domain" );
      "--max", Arg.Set_int max_, " max number of pages to explore";
      "-j", Arg.Set_int j, " number of jobs (default 20)";
    ]
    |> Arg.align
  in
  Arg.parse opts (CCList.Ref.push start) help_str;
  if !start = [] then
    Arg.usage opts help_str
  else (
    let start = List.map Uri.of_string !start in
    let default_host =
      match Uri.host @@ List.hd start with
      | Some h -> h
      | _ -> failwith "need absolute URIs"
      | exception _ -> failwith "need absolute URIs"
    in
    let run = Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start in
    (* crawl *)
    let bad, num, remaining = Lwt_main.run (Run.run run) in
    if bad <> [] then (
      Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n"
        num (List.length bad) remaining;
      List.iter
        (fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri))
        bad;
      exit 1
    ) else
      Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining
  )
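(* Example invocation (hypothetical URL), crawling at most 100 pages within
   example.com using 8 jobs:
     argiope https://example.com -d example.com --max 100 -j 8 -v *)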