reformat with 0.27

This commit is contained in:
Simon Cruanes 2026-01-20 20:50:01 -05:00
parent 2b6297c075
commit 69b73e0f69
No known key found for this signature in database
GPG key ID: EBFFF6F283F3A2B4
9 changed files with 177 additions and 174 deletions

5
dune
View file

@ -1,10 +1,9 @@
(rule
(alias runtest)
(package ezcurl-lwt)
(deps (:file README.md))
(deps
(:file README.md))
(action
(progn
(run ocaml-mdx test %{deps})
(diff? %{file} %{file}.corrected))))

View file

@ -1,9 +1,11 @@
open Lwt.Infix
module Str_set = CCSet.Make(String)
module Uri_tbl = CCHashtbl.Make(struct
module Str_set = CCSet.Make (String)
module Uri_tbl = CCHashtbl.Make (struct
include Uri
let hash u = Hashtbl.hash (to_string u)
end)
end)
let verbose_ = ref 0
@ -19,7 +21,7 @@ module Run = struct
j: int;
}
let push_task (self:t) u : unit =
let push_task (self : t) u : unit =
let u = Uri.canonicalize u in
if not @@ Uri_tbl.mem self.seen u then (
Uri_tbl.add self.seen u ();
@ -31,21 +33,30 @@ module Run = struct
(* include the domains of [start] in [domains] *)
let domains =
List.fold_left
(fun set uri -> match Uri.host uri with
(fun set uri ->
match Uri.host uri with
| None -> set
| Some h -> Str_set.add h set)
domains start
in
let self = {
domains; j; max; tasks; default_host; seen=Uri_tbl.create 256;
bad=[]; n=0;
} in
let self =
{
domains;
j;
max;
tasks;
default_host;
seen = Uri_tbl.create 256;
bad = [];
n = 0;
}
in
List.iter (fun uri -> push_task self uri) start;
self
let bad_code c = c >= 400
let find_urls (body:string) : Uri.t list =
let find_urls (body : string) : Uri.t list =
let body = Soup.parse body in
let open Soup.Infix in
let nodes = body $$ "a[href]" in
@ -57,72 +68,74 @@ module Run = struct
with _ -> l)
[] nodes
let worker (self:t) : unit Lwt.t =
let worker (self : t) : unit Lwt.t =
let client = Ezcurl_lwt.make () in
let rec loop() =
if Queue.is_empty self.tasks then Lwt.return ()
else if self.max >= 0 && self.n > self.max then Lwt.return ()
let rec loop () =
if Queue.is_empty self.tasks then
Lwt.return ()
else if self.max >= 0 && self.n > self.max then
Lwt.return ()
else (
let uri = Queue.pop self.tasks in
if !verbose_>0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri);
if !verbose_ > 0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri);
(* fetch URL (only 500kb) *)
self.n <- 1 + self.n;
Ezcurl_lwt.get ~client ~range:"0-500000"~url:(Uri.to_string uri) ()
Ezcurl_lwt.get ~client ~range:"0-500000" ~url:(Uri.to_string uri) ()
>>= fun resp ->
begin match resp with
| Ok {Ezcurl_lwt.code; body; _} ->
(match resp with
| Ok { Ezcurl_lwt.code; body; _ } ->
if bad_code code then (
if !verbose_>1 then (
Printf.eprintf "bad code when fetching %s: %d\n%!" (Uri.to_string uri) code;
);
self.bad <- uri :: self.bad; (* bad URL! *)
if !verbose_ > 1 then
Printf.eprintf "bad code when fetching %s: %d\n%!"
(Uri.to_string uri) code;
self.bad <- uri :: self.bad (* bad URL! *)
) else (
(* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *)
let cur_host = Uri.host_with_default ~default:self.default_host uri in
let cur_host =
Uri.host_with_default ~default:self.default_host uri
in
let uris = find_urls body in
List.iter
(fun uri' ->
match Uri.host uri' with
| Some h when Str_set.mem h self.domains ->
(* follow this link *)
if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
if !verbose_ > 1 then
Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
push_task self uri'
| Some _ -> ()
| None ->
(* relative URL, make it absolute *)
let uri' = Uri.with_host uri' (Some cur_host) in
let uri' = Uri.with_scheme uri' (Uri.scheme uri) in
if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
push_task self uri'
)
uris;
if !verbose_ > 1 then
Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
push_task self uri')
uris
);
Lwt.return ()
| Error (_, msg) ->
if !verbose_>2 then (
Printf.eprintf "error when fetching %s:\n %s\n%!" (Uri.to_string uri) msg;
);
self.bad <- uri :: self.bad; (* bad URL! *)
Lwt.return ()
end
if !verbose_ > 2 then
Printf.eprintf "error when fetching %s:\n %s\n%!"
(Uri.to_string uri) msg;
self.bad <- uri :: self.bad;
(* bad URL! *)
Lwt.return ())
>>= loop (* recurse *)
)
in
loop()
loop ()
let run (self:t) : _ Lwt.t =
let run (self : t) : _ Lwt.t =
Printf.printf "run %d jobs…\ndomain(s): [%s]\n%!" self.j
(String.concat "," @@ Str_set.elements self.domains);
let workers = CCList.init self.j (fun _ -> worker self) in
(* wait for all workers to be done *)
Lwt.join workers >|= fun () ->
self.bad, self.n, Queue.length self.tasks
Lwt.join workers >|= fun () -> self.bad, self.n, Queue.length self.tasks
end
let help_str =
{|A web crawler that can typically be found in Texas.
{|A web crawler that can typically be found in Texas.
usage: argiope url [url*] [option*]
|}
@ -132,34 +145,41 @@ let () =
let start = ref [] in
let j = ref 20 in
let max_ = ref ~-1 in
let opts = [
let opts =
[
"-v", Arg.Unit (fun _ -> incr verbose_), " verbose";
"--domain", Arg.String (fun s -> domains := Str_set.add s !domains), " include given domain";
"-d", Arg.String (fun s -> domains := Str_set.add s !domains), " alias to --domainm";
( "--domain",
Arg.String (fun s -> domains := Str_set.add s !domains),
" include given domain" );
( "-d",
Arg.String (fun s -> domains := Str_set.add s !domains),
" alias to --domainm" );
"--max", Arg.Set_int max_, " max number of pages to explore";
"-j", Arg.Set_int j, " number of jobs (default 20)";
] |> Arg.align in
]
|> Arg.align
in
Arg.parse opts (CCList.Ref.push start) help_str;
if !start = [] then (
Arg.usage opts help_str;
) else (
if !start = [] then
Arg.usage opts help_str
else (
let start = List.map Uri.of_string !start in
let default_host = match Uri.host @@ List.hd start with
let default_host =
match Uri.host @@ List.hd start with
| Some h -> h
| _ -> failwith "need absolute URIs"
| exception _ -> failwith "need absolute URIs"
in
let run =
Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start
in
let run = Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start in
(* crawl *)
let bad, num, remaining = Lwt_main.run (Run.run run) in
if bad <> [] then (
Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n"
num (List.length bad) remaining;
List.iter (fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri)) bad;
List.iter
(fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri))
bad;
exit 1
) else (
) else
Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining
)
)

View file

@ -1,4 +1,3 @@
(executable
(name argiope)
(modes native)

View file

@ -1,4 +1,3 @@
(library
(name ezcurl_core)
(public_name ezcurl.core)

View file

@ -246,20 +246,19 @@ module type S = sig
@param meth which method to use (see {!meth})
@param tries how many times to retry in case of [CURLE_AGAIN] code
@param client a client to reuse (instead of allocating a new one)
@param range an optional
{{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range}
to fetch (either to get large pages
by chunks, or to resume an interrupted download).
@param range
an optional
{{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests}
byte range} to fetch (either to get large pages by chunks, or to resume
an interrupted download).
@param config configuration to set
@param content the content to send as the query's body, either
a [`String s] to write a single string, or [`Write f]
where [f] is a callback that is called on a buffer [b] with len [n]
(as in [f b n]) and returns how many bytes it wrote in the buffer
[b] starting at index [0] (at most [n] bytes).
It must return [0] when the content is entirely written, and not
before.
@param headers headers of the query
*)
@param content
the content to send as the query's body, either a [`String s] to write a
single string, or [`Write f] where [f] is a callback that is called on a
buffer [b] with len [n] (as in [f b n]) and returns how many bytes it
wrote in the buffer [b] starting at index [0] (at most [n] bytes). It
must return [0] when the content is entirely written, and not before.
@param headers headers of the query *)
(** Push-stream of bytes
@since NEXT_RELEASE *)
@ -292,9 +291,7 @@ module type S = sig
url:string ->
unit ->
(string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:GET]
See {!http} for more info.
*)
(** Shortcut for [http ~meth:GET] See {!http} for more info. *)
val put :
?tries:int ->
@ -305,9 +302,7 @@ module type S = sig
content:[ `String of string | `Write of bytes -> int -> int ] ->
unit ->
(string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:PUT]
See {!http} for more info.
*)
(** Shortcut for [http ~meth:PUT] See {!http} for more info. *)
val post :
?tries:int ->
@ -319,9 +314,7 @@ module type S = sig
url:string ->
unit ->
(string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:(POST params)]
See {!http} for more info.
*)
(** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *)
end
exception Parse_error of Curl.curlCode * string
@ -368,22 +361,21 @@ module Make (IO : IO) : S with type 'a io = 'a IO.t = struct
n := !n + len;
r
and seek i o =
begin match o with
(match o with
| Curl.SEEK_SET -> n := Int64.to_int i
| SEEK_END -> n := String.length s + Int64.to_int i
| SEEK_CUR -> n := !n + Int64.to_int i
end;
| SEEK_CUR -> n := !n + Int64.to_int i);
Curl.SEEKFUNC_OK
in read, seek
in
read, seek
| `Write f ->
let buf = Bytes.create 1024 in
let read i =
let len = min i (Bytes.length buf) in
let n = f buf len in
Bytes.sub_string buf i n
and seek _ _ =
Curl.SEEKFUNC_CANTSEEK
in read, seek
and seek _ _ = Curl.SEEKFUNC_CANTSEEK in
read, seek
let content_size_ = function
| `String s -> Some (String.length s)

View file

@ -16,8 +16,8 @@ module Config : sig
end
type t = private { curl: Curl.t } [@@unboxed]
(** A client, i.e. a cURL instance.
The wrapping record has been present since NEXT_RELEASE *)
(** A client, i.e. a cURL instance. The wrapping record has been present since
NEXT_RELEASE *)
val make :
?set_opts:(Curl.t -> unit) ->
@ -27,9 +27,12 @@ val make :
t
(** Create a new client.
@param set_opts called before returning the client, to set options
@param cookiejar_file if provided, tell curl to use the given file path to store/load cookies (since NEXT_RELEASE)
@param enable_session_cookies if provided, enable cookie handling in curl so it store/load cookies (since NEXT_RELEASE)
*)
@param cookiejar_file
if provided, tell curl to use the given file path to store/load cookies
(since NEXT_RELEASE)
@param enable_session_cookies
if provided, enable cookie handling in curl so it store/load cookies
(since NEXT_RELEASE) *)
val delete : t -> unit
(** Delete the client. It cannot be used anymore. *)
@ -44,16 +47,16 @@ val set_no_signal : bool -> unit
(** Cookie handling.
@since NEXT_RELEASE *)
@since NEXT_RELEASE *)
module Cookies : sig
val flush_cookiejar : t -> unit
(** If [cookiejar_file] was provided in {!make}, this flushes the current set of cookies
to the provided file.
(** If [cookiejar_file] was provided in {!make}, this flushes the current set
of cookies to the provided file.
@since NEXT_RELEASE *)
val reload_cookiejar : t -> unit
(** If [cookiejar_file] was provided in {!make}, this reloads cookies from
the provided file.
(** If [cookiejar_file] was provided in {!make}, this reloads cookies from the
provided file.
@since NEXT_RELEASE *)
val get_cookies : t -> string list
@ -72,11 +75,10 @@ end
type response_info = {
ri_response_time: float;
(** Total time (in seconds) for the request/response pair.
See {!Curl.get_totaltime}. *)
(** Total time (in seconds) for the request/response pair. See
{!Curl.get_totaltime}. *)
ri_redirect_count: int;
(** Number of redirects cURL followed.
See {!Curl.get_redirectcount}. *)
(** Number of redirects cURL followed. See {!Curl.get_redirectcount}. *)
}
(** Metadata about a response from the server. *)
@ -85,7 +87,8 @@ val string_of_response_info : response_info -> string
type 'body response = {
code: int;
(** Response code. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *)
(** Response code. See
https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *)
headers: (string * string) list; (** Response headers *)
body: 'body; (** Response body, or [""] *)
info: response_info; (** Information about the response *)
@ -98,7 +101,8 @@ val pp_response_with :
val pp_response : Format.formatter -> string response -> unit
val string_of_response : string response -> string
(** The {{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method}
(** The
{{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method}
to use *)
type meth =
| GET
@ -145,20 +149,19 @@ module type S = sig
@param meth which method to use (see {!meth})
@param tries how many times to retry in case of [CURLE_AGAIN] code
@param client a client to reuse (instead of allocating a new one)
@param range an optional
{{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range}
to fetch (either to get large pages
by chunks, or to resume an interrupted download).
@param range
an optional
{{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests}
byte range} to fetch (either to get large pages by chunks, or to resume
an interrupted download).
@param config configuration to set
@param content the content to send as the query's body, either
a [`String s] to write a single string, or [`Write f]
where [f] is a callback that is called on a buffer [b] with len [n]
(as in [f b n]) and returns how many bytes it wrote in the buffer
[b] starting at index [0] (at most [n] bytes).
It must return [0] when the content is entirely written, and not
before.
@param headers headers of the query
*)
@param content
the content to send as the query's body, either a [`String s] to write a
single string, or [`Write f] where [f] is a callback that is called on a
buffer [b] with len [n] (as in [f b n]) and returns how many bytes it
wrote in the buffer [b] starting at index [0] (at most [n] bytes). It
must return [0] when the content is entirely written, and not before.
@param headers headers of the query *)
(** Push-based stream of bytes
@since NEXT_RELEASE *)
@ -179,10 +182,9 @@ module type S = sig
write_into:#input_stream ->
unit ->
(unit response, Curl.curlCode * string) result io
(** HTTP call via cURL, with a streaming response body.
The body is given to [write_into] by chunks,
then [write_into#on_close ()] is called
and the response is returned.
(** HTTP call via cURL, with a streaming response body. The body is given to
[write_into] by chunks, then [write_into#on_close ()] is called and the
response is returned.
@since NEXT_RELEASE *)
val get :
@ -194,9 +196,7 @@ module type S = sig
url:string ->
unit ->
(string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:GET]
See {!http} for more info.
*)
(** Shortcut for [http ~meth:GET] See {!http} for more info. *)
val put :
?tries:int ->
@ -207,9 +207,7 @@ module type S = sig
content:[ `String of string | `Write of bytes -> int -> int ] ->
unit ->
(string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:PUT]
See {!http} for more info.
*)
(** Shortcut for [http ~meth:PUT] See {!http} for more info. *)
val post :
?tries:int ->
@ -221,9 +219,7 @@ module type S = sig
url:string ->
unit ->
(string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:(POST params)]
See {!http} for more info.
*)
(** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *)
end
module Make (IO : IO) : S with type 'a io = 'a IO.t

View file

@ -1,4 +1,3 @@
(library
(name ezcurl_lwt)
(public_name ezcurl-lwt)

View file

@ -16,4 +16,3 @@ include Ezcurl_core.Make (struct
Curl.CURLE_OK
with Curl.CurlException (c, _, _) -> c
end)