mirror of
https://github.com/c-cube/ezcurl.git
synced 2026-01-21 08:46:44 -05:00
reformat with 0.27
This commit is contained in:
parent
2b6297c075
commit
69b73e0f69
9 changed files with 177 additions and 174 deletions
5
dune
5
dune
|
|
@ -1,10 +1,9 @@
|
|||
|
||||
(rule
|
||||
(alias runtest)
|
||||
(package ezcurl-lwt)
|
||||
(deps (:file README.md))
|
||||
(deps
|
||||
(:file README.md))
|
||||
(action
|
||||
(progn
|
||||
(run ocaml-mdx test %{deps})
|
||||
(diff? %{file} %{file}.corrected))))
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
open Lwt.Infix
|
||||
module Str_set = CCSet.Make (String)
|
||||
|
||||
module Uri_tbl = CCHashtbl.Make (struct
|
||||
include Uri
|
||||
|
||||
let hash u = Hashtbl.hash (to_string u)
|
||||
end)
|
||||
|
||||
|
|
@ -31,15 +33,24 @@ module Run = struct
|
|||
(* include the domains of [start] in [domains] *)
|
||||
let domains =
|
||||
List.fold_left
|
||||
(fun set uri -> match Uri.host uri with
|
||||
(fun set uri ->
|
||||
match Uri.host uri with
|
||||
| None -> set
|
||||
| Some h -> Str_set.add h set)
|
||||
domains start
|
||||
in
|
||||
let self = {
|
||||
domains; j; max; tasks; default_host; seen=Uri_tbl.create 256;
|
||||
bad=[]; n=0;
|
||||
} in
|
||||
let self =
|
||||
{
|
||||
domains;
|
||||
j;
|
||||
max;
|
||||
tasks;
|
||||
default_host;
|
||||
seen = Uri_tbl.create 256;
|
||||
bad = [];
|
||||
n = 0;
|
||||
}
|
||||
in
|
||||
List.iter (fun uri -> push_task self uri) start;
|
||||
self
|
||||
|
||||
|
|
@ -60,8 +71,10 @@ module Run = struct
|
|||
let worker (self : t) : unit Lwt.t =
|
||||
let client = Ezcurl_lwt.make () in
|
||||
let rec loop () =
|
||||
if Queue.is_empty self.tasks then Lwt.return ()
|
||||
else if self.max >= 0 && self.n > self.max then Lwt.return ()
|
||||
if Queue.is_empty self.tasks then
|
||||
Lwt.return ()
|
||||
else if self.max >= 0 && self.n > self.max then
|
||||
Lwt.return ()
|
||||
else (
|
||||
let uri = Queue.pop self.tasks in
|
||||
if !verbose_ > 0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri);
|
||||
|
|
@ -69,43 +82,45 @@ module Run = struct
|
|||
self.n <- 1 + self.n;
|
||||
Ezcurl_lwt.get ~client ~range:"0-500000" ~url:(Uri.to_string uri) ()
|
||||
>>= fun resp ->
|
||||
begin match resp with
|
||||
(match resp with
|
||||
| Ok { Ezcurl_lwt.code; body; _ } ->
|
||||
if bad_code code then (
|
||||
if !verbose_>1 then (
|
||||
Printf.eprintf "bad code when fetching %s: %d\n%!" (Uri.to_string uri) code;
|
||||
);
|
||||
self.bad <- uri :: self.bad; (* bad URL! *)
|
||||
if !verbose_ > 1 then
|
||||
Printf.eprintf "bad code when fetching %s: %d\n%!"
|
||||
(Uri.to_string uri) code;
|
||||
self.bad <- uri :: self.bad (* bad URL! *)
|
||||
) else (
|
||||
(* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *)
|
||||
let cur_host = Uri.host_with_default ~default:self.default_host uri in
|
||||
let cur_host =
|
||||
Uri.host_with_default ~default:self.default_host uri
|
||||
in
|
||||
let uris = find_urls body in
|
||||
List.iter
|
||||
(fun uri' ->
|
||||
match Uri.host uri' with
|
||||
| Some h when Str_set.mem h self.domains ->
|
||||
(* follow this link *)
|
||||
if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
|
||||
if !verbose_ > 1 then
|
||||
Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
|
||||
push_task self uri'
|
||||
| Some _ -> ()
|
||||
| None ->
|
||||
(* relative URL, make it absolute *)
|
||||
let uri' = Uri.with_host uri' (Some cur_host) in
|
||||
let uri' = Uri.with_scheme uri' (Uri.scheme uri) in
|
||||
if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
|
||||
push_task self uri'
|
||||
)
|
||||
uris;
|
||||
|
||||
if !verbose_ > 1 then
|
||||
Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri');
|
||||
push_task self uri')
|
||||
uris
|
||||
);
|
||||
Lwt.return ()
|
||||
| Error (_, msg) ->
|
||||
if !verbose_>2 then (
|
||||
Printf.eprintf "error when fetching %s:\n %s\n%!" (Uri.to_string uri) msg;
|
||||
);
|
||||
self.bad <- uri :: self.bad; (* bad URL! *)
|
||||
Lwt.return ()
|
||||
end
|
||||
if !verbose_ > 2 then
|
||||
Printf.eprintf "error when fetching %s:\n %s\n%!"
|
||||
(Uri.to_string uri) msg;
|
||||
self.bad <- uri :: self.bad;
|
||||
(* bad URL! *)
|
||||
Lwt.return ())
|
||||
>>= loop (* recurse *)
|
||||
)
|
||||
in
|
||||
|
|
@ -116,11 +131,9 @@ module Run = struct
|
|||
(String.concat "," @@ Str_set.elements self.domains);
|
||||
let workers = CCList.init self.j (fun _ -> worker self) in
|
||||
(* wait for all workers to be done *)
|
||||
Lwt.join workers >|= fun () ->
|
||||
self.bad, self.n, Queue.length self.tasks
|
||||
Lwt.join workers >|= fun () -> self.bad, self.n, Queue.length self.tasks
|
||||
end
|
||||
|
||||
|
||||
let help_str =
|
||||
{|A web crawler that can typically be found in Texas.
|
||||
|
||||
|
|
@ -132,34 +145,41 @@ let () =
|
|||
let start = ref [] in
|
||||
let j = ref 20 in
|
||||
let max_ = ref ~-1 in
|
||||
let opts = [
|
||||
let opts =
|
||||
[
|
||||
"-v", Arg.Unit (fun _ -> incr verbose_), " verbose";
|
||||
"--domain", Arg.String (fun s -> domains := Str_set.add s !domains), " include given domain";
|
||||
"-d", Arg.String (fun s -> domains := Str_set.add s !domains), " alias to --domainm";
|
||||
( "--domain",
|
||||
Arg.String (fun s -> domains := Str_set.add s !domains),
|
||||
" include given domain" );
|
||||
( "-d",
|
||||
Arg.String (fun s -> domains := Str_set.add s !domains),
|
||||
" alias to --domainm" );
|
||||
"--max", Arg.Set_int max_, " max number of pages to explore";
|
||||
"-j", Arg.Set_int j, " number of jobs (default 20)";
|
||||
] |> Arg.align in
|
||||
]
|
||||
|> Arg.align
|
||||
in
|
||||
Arg.parse opts (CCList.Ref.push start) help_str;
|
||||
if !start = [] then (
|
||||
Arg.usage opts help_str;
|
||||
) else (
|
||||
if !start = [] then
|
||||
Arg.usage opts help_str
|
||||
else (
|
||||
let start = List.map Uri.of_string !start in
|
||||
let default_host = match Uri.host @@ List.hd start with
|
||||
let default_host =
|
||||
match Uri.host @@ List.hd start with
|
||||
| Some h -> h
|
||||
| _ -> failwith "need absolute URIs"
|
||||
| exception _ -> failwith "need absolute URIs"
|
||||
in
|
||||
let run =
|
||||
Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start
|
||||
in
|
||||
let run = Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start in
|
||||
(* crawl *)
|
||||
let bad, num, remaining = Lwt_main.run (Run.run run) in
|
||||
if bad <> [] then (
|
||||
Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n"
|
||||
num (List.length bad) remaining;
|
||||
List.iter (fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri)) bad;
|
||||
List.iter
|
||||
(fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri))
|
||||
bad;
|
||||
exit 1
|
||||
) else (
|
||||
) else
|
||||
Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
(executable
|
||||
(name argiope)
|
||||
(modes native)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
(library
|
||||
(name ezcurl_core)
|
||||
(public_name ezcurl.core)
|
||||
|
|
|
|||
|
|
@ -246,20 +246,19 @@ module type S = sig
|
|||
@param meth which method to use (see {!meth})
|
||||
@param tries how many times to retry in case of [CURLE_AGAIN] code
|
||||
@param client a client to reuse (instead of allocating a new one)
|
||||
@param range an optional
|
||||
{{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range}
|
||||
to fetch (either to get large pages
|
||||
by chunks, or to resume an interrupted download).
|
||||
@param range
|
||||
an optional
|
||||
{{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests}
|
||||
byte range} to fetch (either to get large pages by chunks, or to resume
|
||||
an interrupted download).
|
||||
@param config configuration to set
|
||||
@param content the content to send as the query's body, either
|
||||
a [`String s] to write a single string, or [`Write f]
|
||||
where [f] is a callback that is called on a buffer [b] with len [n]
|
||||
(as in [f b n]) and returns how many bytes it wrote in the buffer
|
||||
[b] starting at index [0] (at most [n] bytes).
|
||||
It must return [0] when the content is entirely written, and not
|
||||
before.
|
||||
@param headers headers of the query
|
||||
*)
|
||||
@param content
|
||||
the content to send as the query's body, either a [`String s] to write a
|
||||
single string, or [`Write f] where [f] is a callback that is called on a
|
||||
buffer [b] with len [n] (as in [f b n]) and returns how many bytes it
|
||||
wrote in the buffer [b] starting at index [0] (at most [n] bytes). It
|
||||
must return [0] when the content is entirely written, and not before.
|
||||
@param headers headers of the query *)
|
||||
|
||||
(** Push-stream of bytes
|
||||
@since NEXT_RELEASE *)
|
||||
|
|
@ -292,9 +291,7 @@ module type S = sig
|
|||
url:string ->
|
||||
unit ->
|
||||
(string response, Curl.curlCode * string) result io
|
||||
(** Shortcut for [http ~meth:GET]
|
||||
See {!http} for more info.
|
||||
*)
|
||||
(** Shortcut for [http ~meth:GET] See {!http} for more info. *)
|
||||
|
||||
val put :
|
||||
?tries:int ->
|
||||
|
|
@ -305,9 +302,7 @@ module type S = sig
|
|||
content:[ `String of string | `Write of bytes -> int -> int ] ->
|
||||
unit ->
|
||||
(string response, Curl.curlCode * string) result io
|
||||
(** Shortcut for [http ~meth:PUT]
|
||||
See {!http} for more info.
|
||||
*)
|
||||
(** Shortcut for [http ~meth:PUT] See {!http} for more info. *)
|
||||
|
||||
val post :
|
||||
?tries:int ->
|
||||
|
|
@ -319,9 +314,7 @@ module type S = sig
|
|||
url:string ->
|
||||
unit ->
|
||||
(string response, Curl.curlCode * string) result io
|
||||
(** Shortcut for [http ~meth:(POST params)]
|
||||
See {!http} for more info.
|
||||
*)
|
||||
(** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *)
|
||||
end
|
||||
|
||||
exception Parse_error of Curl.curlCode * string
|
||||
|
|
@ -368,22 +361,21 @@ module Make (IO : IO) : S with type 'a io = 'a IO.t = struct
|
|||
n := !n + len;
|
||||
r
|
||||
and seek i o =
|
||||
begin match o with
|
||||
(match o with
|
||||
| Curl.SEEK_SET -> n := Int64.to_int i
|
||||
| SEEK_END -> n := String.length s + Int64.to_int i
|
||||
| SEEK_CUR -> n := !n + Int64.to_int i
|
||||
end;
|
||||
| SEEK_CUR -> n := !n + Int64.to_int i);
|
||||
Curl.SEEKFUNC_OK
|
||||
in read, seek
|
||||
in
|
||||
read, seek
|
||||
| `Write f ->
|
||||
let buf = Bytes.create 1024 in
|
||||
let read i =
|
||||
let len = min i (Bytes.length buf) in
|
||||
let n = f buf len in
|
||||
Bytes.sub_string buf i n
|
||||
and seek _ _ =
|
||||
Curl.SEEKFUNC_CANTSEEK
|
||||
in read, seek
|
||||
and seek _ _ = Curl.SEEKFUNC_CANTSEEK in
|
||||
read, seek
|
||||
|
||||
let content_size_ = function
|
||||
| `String s -> Some (String.length s)
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@ module Config : sig
|
|||
end
|
||||
|
||||
type t = private { curl: Curl.t } [@@unboxed]
|
||||
(** A client, i.e. a cURL instance.
|
||||
The wrapping record has been present since NEXT_RELEASE *)
|
||||
(** A client, i.e. a cURL instance. The wrapping record has been present since
|
||||
NEXT_RELEASE *)
|
||||
|
||||
val make :
|
||||
?set_opts:(Curl.t -> unit) ->
|
||||
|
|
@ -27,9 +27,12 @@ val make :
|
|||
t
|
||||
(** Create a new client.
|
||||
@param set_opts called before returning the client, to set options
|
||||
@param cookiejar_file if provided, tell curl to use the given file path to store/load cookies (since NEXT_RELEASE)
|
||||
@param enable_session_cookies if provided, enable cookie handling in curl so it store/load cookies (since NEXT_RELEASE)
|
||||
*)
|
||||
@param cookiejar_file
|
||||
if provided, tell curl to use the given file path to store/load cookies
|
||||
(since NEXT_RELEASE)
|
||||
@param enable_session_cookies
|
||||
if provided, enable cookie handling in curl so it store/load cookies
|
||||
(since NEXT_RELEASE) *)
|
||||
|
||||
val delete : t -> unit
|
||||
(** Delete the client. It cannot be used anymore. *)
|
||||
|
|
@ -47,13 +50,13 @@ val set_no_signal : bool -> unit
|
|||
@since NEXT_RELEASE *)
|
||||
module Cookies : sig
|
||||
val flush_cookiejar : t -> unit
|
||||
(** If [cookiejar_file] was provided in {!make}, this flushes the current set of cookies
|
||||
to the provided file.
|
||||
(** If [cookiejar_file] was provided in {!make}, this flushes the current set
|
||||
of cookies to the provided file.
|
||||
@since NEXT_RELEASE *)
|
||||
|
||||
val reload_cookiejar : t -> unit
|
||||
(** If [cookiejar_file] was provided in {!make}, this reloads cookies from
|
||||
the provided file.
|
||||
(** If [cookiejar_file] was provided in {!make}, this reloads cookies from the
|
||||
provided file.
|
||||
@since NEXT_RELEASE *)
|
||||
|
||||
val get_cookies : t -> string list
|
||||
|
|
@ -72,11 +75,10 @@ end
|
|||
|
||||
type response_info = {
|
||||
ri_response_time: float;
|
||||
(** Total time (in seconds) for the request/response pair.
|
||||
See {!Curl.get_totaltime}. *)
|
||||
(** Total time (in seconds) for the request/response pair. See
|
||||
{!Curl.get_totaltime}. *)
|
||||
ri_redirect_count: int;
|
||||
(** Number of redirects cURL followed.
|
||||
See {!Curl.get_redirectcount}. *)
|
||||
(** Number of redirects cURL followed. See {!Curl.get_redirectcount}. *)
|
||||
}
|
||||
(** Metadata about a response from the server. *)
|
||||
|
||||
|
|
@ -85,7 +87,8 @@ val string_of_response_info : response_info -> string
|
|||
|
||||
type 'body response = {
|
||||
code: int;
|
||||
(** Response code. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *)
|
||||
(** Response code. See
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *)
|
||||
headers: (string * string) list; (** Response headers *)
|
||||
body: 'body; (** Response body, or [""] *)
|
||||
info: response_info; (** Information about the response *)
|
||||
|
|
@ -98,7 +101,8 @@ val pp_response_with :
|
|||
val pp_response : Format.formatter -> string response -> unit
|
||||
val string_of_response : string response -> string
|
||||
|
||||
(** The {{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method}
|
||||
(** The
|
||||
{{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method}
|
||||
to use *)
|
||||
type meth =
|
||||
| GET
|
||||
|
|
@ -145,20 +149,19 @@ module type S = sig
|
|||
@param meth which method to use (see {!meth})
|
||||
@param tries how many times to retry in case of [CURLE_AGAIN] code
|
||||
@param client a client to reuse (instead of allocating a new one)
|
||||
@param range an optional
|
||||
{{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range}
|
||||
to fetch (either to get large pages
|
||||
by chunks, or to resume an interrupted download).
|
||||
@param range
|
||||
an optional
|
||||
{{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests}
|
||||
byte range} to fetch (either to get large pages by chunks, or to resume
|
||||
an interrupted download).
|
||||
@param config configuration to set
|
||||
@param content the content to send as the query's body, either
|
||||
a [`String s] to write a single string, or [`Write f]
|
||||
where [f] is a callback that is called on a buffer [b] with len [n]
|
||||
(as in [f b n]) and returns how many bytes it wrote in the buffer
|
||||
[b] starting at index [0] (at most [n] bytes).
|
||||
It must return [0] when the content is entirely written, and not
|
||||
before.
|
||||
@param headers headers of the query
|
||||
*)
|
||||
@param content
|
||||
the content to send as the query's body, either a [`String s] to write a
|
||||
single string, or [`Write f] where [f] is a callback that is called on a
|
||||
buffer [b] with len [n] (as in [f b n]) and returns how many bytes it
|
||||
wrote in the buffer [b] starting at index [0] (at most [n] bytes). It
|
||||
must return [0] when the content is entirely written, and not before.
|
||||
@param headers headers of the query *)
|
||||
|
||||
(** Push-based stream of bytes
|
||||
@since NEXT_RELEASE *)
|
||||
|
|
@ -179,10 +182,9 @@ module type S = sig
|
|||
write_into:#input_stream ->
|
||||
unit ->
|
||||
(unit response, Curl.curlCode * string) result io
|
||||
(** HTTP call via cURL, with a streaming response body.
|
||||
The body is given to [write_into] by chunks,
|
||||
then [write_into#on_close ()] is called
|
||||
and the response is returned.
|
||||
(** HTTP call via cURL, with a streaming response body. The body is given to
|
||||
[write_into] by chunks, then [write_into#on_close ()] is called and the
|
||||
response is returned.
|
||||
@since NEXT_RELEASE *)
|
||||
|
||||
val get :
|
||||
|
|
@ -194,9 +196,7 @@ module type S = sig
|
|||
url:string ->
|
||||
unit ->
|
||||
(string response, Curl.curlCode * string) result io
|
||||
(** Shortcut for [http ~meth:GET]
|
||||
See {!http} for more info.
|
||||
*)
|
||||
(** Shortcut for [http ~meth:GET] See {!http} for more info. *)
|
||||
|
||||
val put :
|
||||
?tries:int ->
|
||||
|
|
@ -207,9 +207,7 @@ module type S = sig
|
|||
content:[ `String of string | `Write of bytes -> int -> int ] ->
|
||||
unit ->
|
||||
(string response, Curl.curlCode * string) result io
|
||||
(** Shortcut for [http ~meth:PUT]
|
||||
See {!http} for more info.
|
||||
*)
|
||||
(** Shortcut for [http ~meth:PUT] See {!http} for more info. *)
|
||||
|
||||
val post :
|
||||
?tries:int ->
|
||||
|
|
@ -221,9 +219,7 @@ module type S = sig
|
|||
url:string ->
|
||||
unit ->
|
||||
(string response, Curl.curlCode * string) result io
|
||||
(** Shortcut for [http ~meth:(POST params)]
|
||||
See {!http} for more info.
|
||||
*)
|
||||
(** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *)
|
||||
end
|
||||
|
||||
module Make (IO : IO) : S with type 'a io = 'a IO.t
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
(library
|
||||
(name ezcurl_lwt)
|
||||
(public_name ezcurl-lwt)
|
||||
|
|
|
|||
|
|
@ -16,4 +16,3 @@ include Ezcurl_core.Make (struct
|
|||
Curl.CURLE_OK
|
||||
with Curl.CurlException (c, _, _) -> c
|
||||
end)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue