diff --git a/argiope.sh b/argiope.sh
new file mode 100755
index 0000000..39b35e6
--- /dev/null
+++ b/argiope.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+exec dune exec examples/argiope/argiope.exe --profile=release -- "$@"
diff --git a/examples/argiope/argiope.ml b/examples/argiope/argiope.ml
new file mode 100644
index 0000000..2107385
--- /dev/null
+++ b/examples/argiope/argiope.ml
@@ -0,0 +1,162 @@
+open Lwt.Infix
+
+module Str_set = CCSet.Make(String)
+
+module Uri_tbl = CCHashtbl.Make(struct
+    include Uri
+    let hash u = Hashtbl.hash (to_string u)
+  end)
+
+let verbose_ = ref 0
+
+module Run = struct
+  type t = {
+    domains: Str_set.t; (* domains to recursively crawl *)
+    tasks: Uri.t Queue.t;
+    default_host: string;
+    max: int;
+    seen: unit Uri_tbl.t; (* already explored *)
+    mutable bad: Uri.t list;
+    mutable n: int; (* number of pages crawled *)
+    j: int;
+  }
+
+  (* schedule [u] for crawling, unless it was already seen *)
+  let push_task (self:t) u : unit =
+    let u = Uri.canonicalize u in
+    if not @@ Uri_tbl.mem self.seen u then (
+      Uri_tbl.add self.seen u ();
+      Queue.push u self.tasks
+    )
+
+  let make ~j ~domains ~default_host ~max start : t =
+    let tasks = Queue.create () in
+    (* include the domains of [start] in [domains] *)
+    let domains =
+      List.fold_left
+        (fun set uri -> match Uri.host uri with
+           | None -> set
+           | Some h -> Str_set.add h set)
+        domains start
+    in
+    let self = {
+      domains; j; max; tasks; default_host; seen=Uri_tbl.create 256;
+      bad=[]; n=0;
+    } in
+    List.iter (fun uri -> push_task self uri) start;
+    self

+  let bad_code c = c >= 400
+
+  (* extract all hyperlinks from an HTML page *)
+  let find_urls (body:string) : Uri.t list =
+    let body = Soup.parse body in
+    let open Soup.Infix in
+    let nodes = body $$ "a[href]" in
+    Soup.fold
+      (fun l n ->
+         try
+           let url' = Soup.R.attribute "href" n in
+           Uri.of_string url' :: l
+         with _ -> l)
+      [] nodes
+
+  let worker (self:t) : unit Lwt.t =
+    let client = Ezcurl_lwt.make () in
+    (* note: a worker returns as soon as it finds the queue empty,
+       even though other workers might still enqueue new tasks *)
+    let rec loop () =
+      if Queue.is_empty self.tasks then Lwt.return ()
+      else if self.max >= 0 && self.n > self.max then Lwt.return ()
+      else (
+        let uri = Queue.pop self.tasks in
+        if !verbose_ > 0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri);
+        (* fetch URL (only 500kb) *)
+        self.n <- 1 + self.n;
+        Ezcurl_lwt.get ~client ~range:"0-500000" ~url:(Uri.to_string uri) ()
+        >>= fun resp ->
+        begin match resp with
+        | Ok {Ezcurl_lwt.code; body; _} ->
+          if bad_code code then (
+            if !verbose_ > 1 then (
+              Printf.eprintf "bad code when fetching %s: %d\n%!" (Uri.to_string uri) code;
+            );
+            self.bad <- uri :: self.bad; (* bad URL! *)
+          ) else (
+            (* if !verbose_ > 0 then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *)
+            let cur_host = Uri.host_with_default ~default:self.default_host uri in
+            let uris = find_urls body in
+            List.iter
+              (fun uri' ->
+                 match Uri.host uri' with
+                 | Some h when Str_set.mem h self.domains ->
+                   (* follow this link *)
+                   if !verbose_ > 1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
+                   push_task self uri'
+                 | Some _ -> ()
+                 | None ->
+                   (* relative URL, make it absolute *)
+                   let uri' = Uri.with_host uri' (Some cur_host) in
+                   let uri' = Uri.with_scheme uri' (Uri.scheme uri) in
+                   if !verbose_ > 1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
+                   push_task self uri')
+              uris;
+          );
+          Lwt.return ()
+        | Error (_, msg) ->
+          if !verbose_ > 2 then (
+            Printf.eprintf "error when fetching %s:\n %s\n%!" (Uri.to_string uri) msg;
+          );
+          self.bad <- uri :: self.bad; (* bad URL! *)
+          Lwt.return ()
+        end
+        >>= loop (* recurse *)
+      )
+    in
+    loop ()
+
+  let run (self:t) : _ Lwt.t =
+    Printf.printf "run %d jobs…\ndomain(s): [%s]\n%!" self.j
+      (String.concat "," @@ Str_set.elements self.domains);
+    let workers = CCList.init self.j (fun _ -> worker self) in
+    (* wait for all workers to be done *)
+    Lwt.join workers >|= fun () ->
+    self.bad, self.n, Queue.length self.tasks
+end
+
+let () =
+  let domains = ref Str_set.empty in
+  let start = ref [] in
+  let j = ref 20 in
+  let max_ = ref ~-1 in
+  let opts = [
+    "-v", Arg.Unit (fun _ -> incr verbose_), " verbose";
+    "--domain", Arg.String (fun s -> domains := Str_set.add s !domains), " include given domain";
+    "-d", Arg.String (fun s -> domains := Str_set.add s !domains), " alias to --domain";
+    "--max", Arg.Set_int max_, " max number of pages to explore";
+    "-j", Arg.Set_int j, " number of jobs (default 20)";
+  ] |> Arg.align in
+  Arg.parse opts (CCList.Ref.push start) "usage: argiope url [url*] [option*]";
+  if !start = [] then (
+    failwith "need at least one initial URL"
+  ) else (
+    let start = List.map Uri.of_string !start in
+    let default_host = match Uri.host @@ List.hd start with
+      | Some h -> h
+      | _ -> failwith "need absolute URIs"
+      | exception _ -> failwith "need absolute URIs"
+    in
+    let run =
+      Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start
+    in
+    (* crawl *)
+    let bad, num, remaining = Lwt_main.run (Run.run run) in
+    if bad <> [] then (
+      Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n"
+        num (List.length bad) remaining;
+      List.iter (fun uri -> Printf.printf "  dead: %s\n" (Uri.to_string uri)) bad;
+      exit 1
+    ) else (
+      Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining
+    )
+  )
diff --git a/examples/argiope/dune b/examples/argiope/dune
new file mode 100644
index 0000000..1215ee6
--- /dev/null
+++ b/examples/argiope/dune
@@ -0,0 +1,6 @@
+
+(executable
+  (name argiope)
+  (synopsis "A web crawler that can typically be found in Texas")
+  (modes native)
+  (libraries uri lambdasoup lwt ezcurl-lwt containers))
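
For reference, a hypothetical invocation of the wrapper script added above (the URL and flag values are placeholders; the flags themselves are the ones declared in `opts` in `argiope.ml`):

    ./argiope.sh https://example.com -d example.com -j 10 --max 1000 -v

The crawl is restricted to the hosts passed via `-d`/`--domain` plus the hosts of the start URLs, so this sketch stays on `example.com`; the process exits with status 1 if any dead links were found.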