reformat with 0.27

This commit is contained in:
Simon Cruanes 2026-01-20 20:50:01 -05:00
parent 2b6297c075
commit 69b73e0f69
No known key found for this signature in database
GPG key ID: EBFFF6F283F3A2B4
9 changed files with 177 additions and 174 deletions

5
dune
View file

@ -1,10 +1,9 @@
(rule (rule
(alias runtest) (alias runtest)
(package ezcurl-lwt) (package ezcurl-lwt)
(deps (:file README.md)) (deps
(:file README.md))
(action (action
(progn (progn
(run ocaml-mdx test %{deps}) (run ocaml-mdx test %{deps})
(diff? %{file} %{file}.corrected)))) (diff? %{file} %{file}.corrected))))

View file

@ -1,9 +1,11 @@
open Lwt.Infix open Lwt.Infix
module Str_set = CCSet.Make(String) module Str_set = CCSet.Make (String)
module Uri_tbl = CCHashtbl.Make(struct
module Uri_tbl = CCHashtbl.Make (struct
include Uri include Uri
let hash u = Hashtbl.hash (to_string u) let hash u = Hashtbl.hash (to_string u)
end) end)
let verbose_ = ref 0 let verbose_ = ref 0
@ -19,7 +21,7 @@ module Run = struct
j: int; j: int;
} }
let push_task (self:t) u : unit = let push_task (self : t) u : unit =
let u = Uri.canonicalize u in let u = Uri.canonicalize u in
if not @@ Uri_tbl.mem self.seen u then ( if not @@ Uri_tbl.mem self.seen u then (
Uri_tbl.add self.seen u (); Uri_tbl.add self.seen u ();
@ -31,21 +33,30 @@ module Run = struct
(* include the domains of [start] in [domains] *) (* include the domains of [start] in [domains] *)
let domains = let domains =
List.fold_left List.fold_left
(fun set uri -> match Uri.host uri with (fun set uri ->
match Uri.host uri with
| None -> set | None -> set
| Some h -> Str_set.add h set) | Some h -> Str_set.add h set)
domains start domains start
in in
let self = { let self =
domains; j; max; tasks; default_host; seen=Uri_tbl.create 256; {
bad=[]; n=0; domains;
} in j;
max;
tasks;
default_host;
seen = Uri_tbl.create 256;
bad = [];
n = 0;
}
in
List.iter (fun uri -> push_task self uri) start; List.iter (fun uri -> push_task self uri) start;
self self
let bad_code c = c >= 400 let bad_code c = c >= 400
let find_urls (body:string) : Uri.t list = let find_urls (body : string) : Uri.t list =
let body = Soup.parse body in let body = Soup.parse body in
let open Soup.Infix in let open Soup.Infix in
let nodes = body $$ "a[href]" in let nodes = body $$ "a[href]" in
@ -57,72 +68,74 @@ module Run = struct
with _ -> l) with _ -> l)
[] nodes [] nodes
let worker (self:t) : unit Lwt.t = let worker (self : t) : unit Lwt.t =
let client = Ezcurl_lwt.make () in let client = Ezcurl_lwt.make () in
let rec loop() = let rec loop () =
if Queue.is_empty self.tasks then Lwt.return () if Queue.is_empty self.tasks then
else if self.max >= 0 && self.n > self.max then Lwt.return () Lwt.return ()
else if self.max >= 0 && self.n > self.max then
Lwt.return ()
else ( else (
let uri = Queue.pop self.tasks in let uri = Queue.pop self.tasks in
if !verbose_>0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri); if !verbose_ > 0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri);
(* fetch URL (only 500kb) *) (* fetch URL (only 500kb) *)
self.n <- 1 + self.n; self.n <- 1 + self.n;
Ezcurl_lwt.get ~client ~range:"0-500000"~url:(Uri.to_string uri) () Ezcurl_lwt.get ~client ~range:"0-500000" ~url:(Uri.to_string uri) ()
>>= fun resp -> >>= fun resp ->
begin match resp with (match resp with
| Ok {Ezcurl_lwt.code; body; _} -> | Ok { Ezcurl_lwt.code; body; _ } ->
if bad_code code then ( if bad_code code then (
if !verbose_>1 then ( if !verbose_ > 1 then
Printf.eprintf "bad code when fetching %s: %d\n%!" (Uri.to_string uri) code; Printf.eprintf "bad code when fetching %s: %d\n%!"
); (Uri.to_string uri) code;
self.bad <- uri :: self.bad; (* bad URL! *) self.bad <- uri :: self.bad (* bad URL! *)
) else ( ) else (
(* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *) (* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *)
let cur_host = Uri.host_with_default ~default:self.default_host uri in let cur_host =
Uri.host_with_default ~default:self.default_host uri
in
let uris = find_urls body in let uris = find_urls body in
List.iter List.iter
(fun uri' -> (fun uri' ->
match Uri.host uri' with match Uri.host uri' with
| Some h when Str_set.mem h self.domains -> | Some h when Str_set.mem h self.domains ->
(* follow this link *) (* follow this link *)
if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri'); if !verbose_ > 1 then
Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
push_task self uri' push_task self uri'
| Some _ -> () | Some _ -> ()
| None -> | None ->
(* relative URL, make it absolute *) (* relative URL, make it absolute *)
let uri' = Uri.with_host uri' (Some cur_host) in let uri' = Uri.with_host uri' (Some cur_host) in
let uri' = Uri.with_scheme uri' (Uri.scheme uri) in let uri' = Uri.with_scheme uri' (Uri.scheme uri) in
if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri'); if !verbose_ > 1 then
push_task self uri' Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
) push_task self uri')
uris; uris
); );
Lwt.return () Lwt.return ()
| Error (_, msg) -> | Error (_, msg) ->
if !verbose_>2 then ( if !verbose_ > 2 then
Printf.eprintf "error when fetching %s:\n %s\n%!" (Uri.to_string uri) msg; Printf.eprintf "error when fetching %s:\n %s\n%!"
); (Uri.to_string uri) msg;
self.bad <- uri :: self.bad; (* bad URL! *) self.bad <- uri :: self.bad;
Lwt.return () (* bad URL! *)
end Lwt.return ())
>>= loop (* recurse *) >>= loop (* recurse *)
) )
in in
loop() loop ()
let run (self:t) : _ Lwt.t = let run (self : t) : _ Lwt.t =
Printf.printf "run %d jobs…\ndomain(s): [%s]\n%!" self.j Printf.printf "run %d jobs…\ndomain(s): [%s]\n%!" self.j
(String.concat "," @@ Str_set.elements self.domains); (String.concat "," @@ Str_set.elements self.domains);
let workers = CCList.init self.j (fun _ -> worker self) in let workers = CCList.init self.j (fun _ -> worker self) in
(* wait for all workers to be done *) (* wait for all workers to be done *)
Lwt.join workers >|= fun () -> Lwt.join workers >|= fun () -> self.bad, self.n, Queue.length self.tasks
self.bad, self.n, Queue.length self.tasks
end end
let help_str = let help_str =
{|A web crawler that can typically be found in Texas. {|A web crawler that can typically be found in Texas.
usage: argiope url [url*] [option*] usage: argiope url [url*] [option*]
|} |}
@ -132,34 +145,41 @@ let () =
let start = ref [] in let start = ref [] in
let j = ref 20 in let j = ref 20 in
let max_ = ref ~-1 in let max_ = ref ~-1 in
let opts = [ let opts =
[
"-v", Arg.Unit (fun _ -> incr verbose_), " verbose"; "-v", Arg.Unit (fun _ -> incr verbose_), " verbose";
"--domain", Arg.String (fun s -> domains := Str_set.add s !domains), " include given domain"; ( "--domain",
"-d", Arg.String (fun s -> domains := Str_set.add s !domains), " alias to --domainm"; Arg.String (fun s -> domains := Str_set.add s !domains),
" include given domain" );
( "-d",
Arg.String (fun s -> domains := Str_set.add s !domains),
" alias to --domainm" );
"--max", Arg.Set_int max_, " max number of pages to explore"; "--max", Arg.Set_int max_, " max number of pages to explore";
"-j", Arg.Set_int j, " number of jobs (default 20)"; "-j", Arg.Set_int j, " number of jobs (default 20)";
] |> Arg.align in ]
|> Arg.align
in
Arg.parse opts (CCList.Ref.push start) help_str; Arg.parse opts (CCList.Ref.push start) help_str;
if !start = [] then ( if !start = [] then
Arg.usage opts help_str; Arg.usage opts help_str
) else ( else (
let start = List.map Uri.of_string !start in let start = List.map Uri.of_string !start in
let default_host = match Uri.host @@ List.hd start with let default_host =
match Uri.host @@ List.hd start with
| Some h -> h | Some h -> h
| _ -> failwith "need absolute URIs" | _ -> failwith "need absolute URIs"
| exception _ -> failwith "need absolute URIs" | exception _ -> failwith "need absolute URIs"
in in
let run = let run = Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start in
Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start
in
(* crawl *) (* crawl *)
let bad, num, remaining = Lwt_main.run (Run.run run) in let bad, num, remaining = Lwt_main.run (Run.run run) in
if bad <> [] then ( if bad <> [] then (
Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n" Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n"
num (List.length bad) remaining; num (List.length bad) remaining;
List.iter (fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri)) bad; List.iter
(fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri))
bad;
exit 1 exit 1
) else ( ) else
Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining
) )
)

View file

@ -1,4 +1,3 @@
(executable (executable
(name argiope) (name argiope)
(modes native) (modes native)

View file

@ -1,4 +1,3 @@
(library (library
(name ezcurl_core) (name ezcurl_core)
(public_name ezcurl.core) (public_name ezcurl.core)

View file

@ -246,20 +246,19 @@ module type S = sig
@param meth which method to use (see {!meth}) @param meth which method to use (see {!meth})
@param tries how many times to retry in case of [CURLE_AGAIN] code @param tries how many times to retry in case of [CURLE_AGAIN] code
@param client a client to reuse (instead of allocating a new one) @param client a client to reuse (instead of allocating a new one)
@param range an optional @param range
{{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range} an optional
to fetch (either to get large pages {{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests}
by chunks, or to resume an interrupted download). byte range} to fetch (either to get large pages by chunks, or to resume
an interrupted download).
@param config configuration to set @param config configuration to set
@param content the content to send as the query's body, either @param content
a [`String s] to write a single string, or [`Write f] the content to send as the query's body, either a [`String s] to write a
where [f] is a callback that is called on a buffer [b] with len [n] single string, or [`Write f] where [f] is a callback that is called on a
(as in [f b n]) and returns how many bytes it wrote in the buffer buffer [b] with len [n] (as in [f b n]) and returns how many bytes it
[b] starting at index [0] (at most [n] bytes). wrote in the buffer [b] starting at index [0] (at most [n] bytes). It
It must return [0] when the content is entirely written, and not must return [0] when the content is entirely written, and not before.
before. @param headers headers of the query *)
@param headers headers of the query
*)
(** Push-stream of bytes (** Push-stream of bytes
@since NEXT_RELEASE *) @since NEXT_RELEASE *)
@ -292,9 +291,7 @@ module type S = sig
url:string -> url:string ->
unit -> unit ->
(string response, Curl.curlCode * string) result io (string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:GET] (** Shortcut for [http ~meth:GET] See {!http} for more info. *)
See {!http} for more info.
*)
val put : val put :
?tries:int -> ?tries:int ->
@ -305,9 +302,7 @@ module type S = sig
content:[ `String of string | `Write of bytes -> int -> int ] -> content:[ `String of string | `Write of bytes -> int -> int ] ->
unit -> unit ->
(string response, Curl.curlCode * string) result io (string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:PUT] (** Shortcut for [http ~meth:PUT] See {!http} for more info. *)
See {!http} for more info.
*)
val post : val post :
?tries:int -> ?tries:int ->
@ -319,9 +314,7 @@ module type S = sig
url:string -> url:string ->
unit -> unit ->
(string response, Curl.curlCode * string) result io (string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:(POST params)] (** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *)
See {!http} for more info.
*)
end end
exception Parse_error of Curl.curlCode * string exception Parse_error of Curl.curlCode * string
@ -368,22 +361,21 @@ module Make (IO : IO) : S with type 'a io = 'a IO.t = struct
n := !n + len; n := !n + len;
r r
and seek i o = and seek i o =
begin match o with (match o with
| Curl.SEEK_SET -> n := Int64.to_int i | Curl.SEEK_SET -> n := Int64.to_int i
| SEEK_END -> n := String.length s + Int64.to_int i | SEEK_END -> n := String.length s + Int64.to_int i
| SEEK_CUR -> n := !n + Int64.to_int i | SEEK_CUR -> n := !n + Int64.to_int i);
end;
Curl.SEEKFUNC_OK Curl.SEEKFUNC_OK
in read, seek in
read, seek
| `Write f -> | `Write f ->
let buf = Bytes.create 1024 in let buf = Bytes.create 1024 in
let read i = let read i =
let len = min i (Bytes.length buf) in let len = min i (Bytes.length buf) in
let n = f buf len in let n = f buf len in
Bytes.sub_string buf i n Bytes.sub_string buf i n
and seek _ _ = and seek _ _ = Curl.SEEKFUNC_CANTSEEK in
Curl.SEEKFUNC_CANTSEEK read, seek
in read, seek
let content_size_ = function let content_size_ = function
| `String s -> Some (String.length s) | `String s -> Some (String.length s)

View file

@ -16,8 +16,8 @@ module Config : sig
end end
type t = private { curl: Curl.t } [@@unboxed] type t = private { curl: Curl.t } [@@unboxed]
(** A client, i.e. a cURL instance. (** A client, i.e. a cURL instance. The wrapping record has been present since
The wrapping record has been present since NEXT_RELEASE *) NEXT_RELEASE *)
val make : val make :
?set_opts:(Curl.t -> unit) -> ?set_opts:(Curl.t -> unit) ->
@ -27,9 +27,12 @@ val make :
t t
(** Create a new client. (** Create a new client.
@param set_opts called before returning the client, to set options @param set_opts called before returning the client, to set options
@param cookiejar_file if provided, tell curl to use the given file path to store/load cookies (since NEXT_RELEASE) @param cookiejar_file
@param enable_session_cookies if provided, enable cookie handling in curl so it store/load cookies (since NEXT_RELEASE) if provided, tell curl to use the given file path to store/load cookies
*) (since NEXT_RELEASE)
@param enable_session_cookies
if provided, enable cookie handling in curl so it store/load cookies
(since NEXT_RELEASE) *)
val delete : t -> unit val delete : t -> unit
(** Delete the client. It cannot be used anymore. *) (** Delete the client. It cannot be used anymore. *)
@ -44,16 +47,16 @@ val set_no_signal : bool -> unit
(** Cookie handling. (** Cookie handling.
@since NEXT_RELEASE *) @since NEXT_RELEASE *)
module Cookies : sig module Cookies : sig
val flush_cookiejar : t -> unit val flush_cookiejar : t -> unit
(** If [cookiejar_file] was provided in {!make}, this flushes the current set of cookies (** If [cookiejar_file] was provided in {!make}, this flushes the current set
to the provided file. of cookies to the provided file.
@since NEXT_RELEASE *) @since NEXT_RELEASE *)
val reload_cookiejar : t -> unit val reload_cookiejar : t -> unit
(** If [cookiejar_file] was provided in {!make}, this reloads cookies from (** If [cookiejar_file] was provided in {!make}, this reloads cookies from the
the provided file. provided file.
@since NEXT_RELEASE *) @since NEXT_RELEASE *)
val get_cookies : t -> string list val get_cookies : t -> string list
@ -72,11 +75,10 @@ end
type response_info = { type response_info = {
ri_response_time: float; ri_response_time: float;
(** Total time (in seconds) for the request/response pair. (** Total time (in seconds) for the request/response pair. See
See {!Curl.get_totaltime}. *) {!Curl.get_totaltime}. *)
ri_redirect_count: int; ri_redirect_count: int;
(** Number of redirects cURL followed. (** Number of redirects cURL followed. See {!Curl.get_redirectcount}. *)
See {!Curl.get_redirectcount}. *)
} }
(** Metadata about a response from the server. *) (** Metadata about a response from the server. *)
@ -85,7 +87,8 @@ val string_of_response_info : response_info -> string
type 'body response = { type 'body response = {
code: int; code: int;
(** Response code. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *) (** Response code. See
https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *)
headers: (string * string) list; (** Response headers *) headers: (string * string) list; (** Response headers *)
body: 'body; (** Response body, or [""] *) body: 'body; (** Response body, or [""] *)
info: response_info; (** Information about the response *) info: response_info; (** Information about the response *)
@ -98,7 +101,8 @@ val pp_response_with :
val pp_response : Format.formatter -> string response -> unit val pp_response : Format.formatter -> string response -> unit
val string_of_response : string response -> string val string_of_response : string response -> string
(** The {{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method} (** The
{{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method}
to use *) to use *)
type meth = type meth =
| GET | GET
@ -145,20 +149,19 @@ module type S = sig
@param meth which method to use (see {!meth}) @param meth which method to use (see {!meth})
@param tries how many times to retry in case of [CURLE_AGAIN] code @param tries how many times to retry in case of [CURLE_AGAIN] code
@param client a client to reuse (instead of allocating a new one) @param client a client to reuse (instead of allocating a new one)
@param range an optional @param range
{{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range} an optional
to fetch (either to get large pages {{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests}
by chunks, or to resume an interrupted download). byte range} to fetch (either to get large pages by chunks, or to resume
an interrupted download).
@param config configuration to set @param config configuration to set
@param content the content to send as the query's body, either @param content
a [`String s] to write a single string, or [`Write f] the content to send as the query's body, either a [`String s] to write a
where [f] is a callback that is called on a buffer [b] with len [n] single string, or [`Write f] where [f] is a callback that is called on a
(as in [f b n]) and returns how many bytes it wrote in the buffer buffer [b] with len [n] (as in [f b n]) and returns how many bytes it
[b] starting at index [0] (at most [n] bytes). wrote in the buffer [b] starting at index [0] (at most [n] bytes). It
It must return [0] when the content is entirely written, and not must return [0] when the content is entirely written, and not before.
before. @param headers headers of the query *)
@param headers headers of the query
*)
(** Push-based stream of bytes (** Push-based stream of bytes
@since NEXT_RELEASE *) @since NEXT_RELEASE *)
@ -179,10 +182,9 @@ module type S = sig
write_into:#input_stream -> write_into:#input_stream ->
unit -> unit ->
(unit response, Curl.curlCode * string) result io (unit response, Curl.curlCode * string) result io
(** HTTP call via cURL, with a streaming response body. (** HTTP call via cURL, with a streaming response body. The body is given to
The body is given to [write_into] by chunks, [write_into] by chunks, then [write_into#on_close ()] is called and the
then [write_into#on_close ()] is called response is returned.
and the response is returned.
@since NEXT_RELEASE *) @since NEXT_RELEASE *)
val get : val get :
@ -194,9 +196,7 @@ module type S = sig
url:string -> url:string ->
unit -> unit ->
(string response, Curl.curlCode * string) result io (string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:GET] (** Shortcut for [http ~meth:GET] See {!http} for more info. *)
See {!http} for more info.
*)
val put : val put :
?tries:int -> ?tries:int ->
@ -207,9 +207,7 @@ module type S = sig
content:[ `String of string | `Write of bytes -> int -> int ] -> content:[ `String of string | `Write of bytes -> int -> int ] ->
unit -> unit ->
(string response, Curl.curlCode * string) result io (string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:PUT] (** Shortcut for [http ~meth:PUT] See {!http} for more info. *)
See {!http} for more info.
*)
val post : val post :
?tries:int -> ?tries:int ->
@ -221,9 +219,7 @@ module type S = sig
url:string -> url:string ->
unit -> unit ->
(string response, Curl.curlCode * string) result io (string response, Curl.curlCode * string) result io
(** Shortcut for [http ~meth:(POST params)] (** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *)
See {!http} for more info.
*)
end end
module Make (IO : IO) : S with type 'a io = 'a IO.t module Make (IO : IO) : S with type 'a io = 'a IO.t

View file

@ -1,4 +1,3 @@
(library (library
(name ezcurl_lwt) (name ezcurl_lwt)
(public_name ezcurl-lwt) (public_name ezcurl-lwt)

View file

@ -16,4 +16,3 @@ include Ezcurl_core.Make (struct
Curl.CURLE_OK Curl.CURLE_OK
with Curl.CurlException (c, _, _) -> c with Curl.CurlException (c, _, _) -> c
end) end)