diff --git a/dune b/dune index 6e63ea7..c65a323 100644 --- a/dune +++ b/dune @@ -1,10 +1,9 @@ - (rule (alias runtest) (package ezcurl-lwt) - (deps (:file README.md)) + (deps + (:file README.md)) (action - (progn - (run ocaml-mdx test %{deps}) - (diff? %{file} %{file}.corrected)))) - + (progn + (run ocaml-mdx test %{deps}) + (diff? %{file} %{file}.corrected)))) diff --git a/examples/argiope/argiope.ml b/examples/argiope/argiope.ml index 5df5c7a..27cae09 100644 --- a/examples/argiope/argiope.ml +++ b/examples/argiope/argiope.ml @@ -1,9 +1,11 @@ open Lwt.Infix -module Str_set = CCSet.Make(String) -module Uri_tbl = CCHashtbl.Make(struct - include Uri - let hash u = Hashtbl.hash (to_string u) - end) +module Str_set = CCSet.Make (String) + +module Uri_tbl = CCHashtbl.Make (struct + include Uri + + let hash u = Hashtbl.hash (to_string u) +end) let verbose_ = ref 0 @@ -19,7 +21,7 @@ module Run = struct j: int; } - let push_task (self:t) u : unit = + let push_task (self : t) u : unit = let u = Uri.canonicalize u in if not @@ Uri_tbl.mem self.seen u then ( Uri_tbl.add self.seen u (); @@ -31,98 +33,109 @@ module Run = struct (* include the domains of [start] in [domains] *) let domains = List.fold_left - (fun set uri -> match Uri.host uri with - | None -> set - | Some h -> Str_set.add h set) + (fun set uri -> + match Uri.host uri with + | None -> set + | Some h -> Str_set.add h set) domains start in - let self = { - domains; j; max; tasks; default_host; seen=Uri_tbl.create 256; - bad=[]; n=0; - } in + let self = + { + domains; + j; + max; + tasks; + default_host; + seen = Uri_tbl.create 256; + bad = []; + n = 0; + } + in List.iter (fun uri -> push_task self uri) start; self let bad_code c = c >= 400 - let find_urls (body:string) : Uri.t list = + let find_urls (body : string) : Uri.t list = let body = Soup.parse body in let open Soup.Infix in let nodes = body $$ "a[href]" in Soup.fold (fun l n -> - try - let url' = Soup.R.attribute "href" n in - Uri.of_string url' :: l - with _ -> l) + try + let url' = Soup.R.attribute "href" n in + Uri.of_string url' :: l + with _ -> l) [] nodes - let worker (self:t) : unit Lwt.t = + let worker (self : t) : unit Lwt.t = let client = Ezcurl_lwt.make () in - let rec loop() = - if Queue.is_empty self.tasks then Lwt.return () - else if self.max >= 0 && self.n > self.max then Lwt.return () + let rec loop () = + if Queue.is_empty self.tasks then + Lwt.return () + else if self.max >= 0 && self.n > self.max then + Lwt.return () else ( let uri = Queue.pop self.tasks in - if !verbose_>0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri); + if !verbose_ > 0 then Printf.eprintf "crawl %s\n%!" (Uri.to_string uri); (* fetch URL (only 500kb) *) self.n <- 1 + self.n; - Ezcurl_lwt.get ~client ~range:"0-500000"~url:(Uri.to_string uri) () + Ezcurl_lwt.get ~client ~range:"0-500000" ~url:(Uri.to_string uri) () >>= fun resp -> - begin match resp with - | Ok {Ezcurl_lwt.code; body; _} -> - if bad_code code then ( - if !verbose_>1 then ( - Printf.eprintf "bad code when fetching %s: %d\n%!" (Uri.to_string uri) code; - ); - self.bad <- uri :: self.bad; (* bad URL! *) - ) else ( - (* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *) - let cur_host = Uri.host_with_default ~default:self.default_host uri in - let uris = find_urls body in - List.iter - (fun uri' -> - match Uri.host uri' with - | Some h when Str_set.mem h self.domains -> - (* follow this link *) - if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri'); - push_task self uri' - | Some _ -> () - | None -> - (* relative URL, make it absolute *) - let uri' = Uri.with_host uri' (Some cur_host) in - let uri' = Uri.with_scheme uri' (Uri.scheme uri) in - if !verbose_>1 then Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri'); - push_task self uri' - ) - uris; - - ); - Lwt.return () - | Error (_, msg) -> - if !verbose_>2 then ( - Printf.eprintf "error when fetching %s:\n %s\n%!" (Uri.to_string uri) msg; - ); - self.bad <- uri :: self.bad; (* bad URL! *) - Lwt.return () - end + (match resp with + | Ok { Ezcurl_lwt.code; body; _ } -> + if bad_code code then ( + if !verbose_ > 1 then + Printf.eprintf "bad code when fetching %s: %d\n%!" + (Uri.to_string uri) code; + self.bad <- uri :: self.bad (* bad URL! *) + ) else ( + (* if !verbose_ then Printf.eprintf "body for %s:\n%s\n" (Uri.to_string uri) body; *) + let cur_host = + Uri.host_with_default ~default:self.default_host uri + in + let uris = find_urls body in + List.iter + (fun uri' -> + match Uri.host uri' with + | Some h when Str_set.mem h self.domains -> + (* follow this link *) + if !verbose_ > 1 then + Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri'); + push_task self uri' + | Some _ -> () + | None -> + (* relative URL, make it absolute *) + let uri' = Uri.with_host uri' (Some cur_host) in + let uri' = Uri.with_scheme uri' (Uri.scheme uri) in + if !verbose_ > 1 then + Printf.eprintf "follow link to %s\n%!" (Uri.to_string uri'); + push_task self uri') + uris + ); + Lwt.return () + | Error (_, msg) -> + if !verbose_ > 2 then + Printf.eprintf "error when fetching %s:\n %s\n%!" + (Uri.to_string uri) msg; + self.bad <- uri :: self.bad; + (* bad URL! *) + Lwt.return ()) >>= loop (* recurse *) ) in - loop() + loop () - let run (self:t) : _ Lwt.t = + let run (self : t) : _ Lwt.t = Printf.printf "run %d jobs…\ndomain(s): [%s]\n%!" self.j (String.concat "," @@ Str_set.elements self.domains); let workers = CCList.init self.j (fun _ -> worker self) in (* wait for all workers to be done *) - Lwt.join workers >|= fun () -> - self.bad, self.n, Queue.length self.tasks + Lwt.join workers >|= fun () -> self.bad, self.n, Queue.length self.tasks end - let help_str = -{|A web crawler that can typically be found in Texas. + {|A web crawler that can typically be found in Texas. usage: argiope url [url*] [option*] |} @@ -132,34 +145,41 @@ let () = let start = ref [] in let j = ref 20 in let max_ = ref ~-1 in - let opts = [ - "-v", Arg.Unit (fun _ -> incr verbose_), " verbose"; - "--domain", Arg.String (fun s -> domains := Str_set.add s !domains), " include given domain"; - "-d", Arg.String (fun s -> domains := Str_set.add s !domains), " alias to --domainm"; - "--max", Arg.Set_int max_, " max number of pages to explore"; - "-j", Arg.Set_int j, " number of jobs (default 20)"; - ] |> Arg.align in + let opts = + [ + "-v", Arg.Unit (fun _ -> incr verbose_), " verbose"; + ( "--domain", + Arg.String (fun s -> domains := Str_set.add s !domains), + " include given domain" ); + ( "-d", + Arg.String (fun s -> domains := Str_set.add s !domains), + " alias to --domainm" ); + "--max", Arg.Set_int max_, " max number of pages to explore"; + "-j", Arg.Set_int j, " number of jobs (default 20)"; + ] + |> Arg.align + in Arg.parse opts (CCList.Ref.push start) help_str; - if !start = [] then ( - Arg.usage opts help_str; - ) else ( + if !start = [] then + Arg.usage opts help_str + else ( let start = List.map Uri.of_string !start in - let default_host = match Uri.host @@ List.hd start with + let default_host = + match Uri.host @@ List.hd start with | Some h -> h | _ -> failwith "need absolute URIs" | exception _ -> failwith "need absolute URIs" in - let run = - Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start - in + let run = Run.make ~default_host ~j:!j ~domains:!domains ~max:!max_ start in (* crawl *) let bad, num, remaining = Lwt_main.run (Run.run run) in if bad <> [] then ( Printf.printf "ERROR: crawled %d pages, %d dead links (%d remaining)\n" num (List.length bad) remaining; - List.iter (fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri)) bad; + List.iter + (fun uri -> Printf.printf " dead: %s\n" (Uri.to_string uri)) + bad; exit 1 - ) else ( + ) else Printf.printf "OK: crawled %d pages (remaining %d)\n" num remaining - ) ) diff --git a/examples/argiope/dune b/examples/argiope/dune index dcc3da5..f84edb4 100644 --- a/examples/argiope/dune +++ b/examples/argiope/dune @@ -1,5 +1,4 @@ - (executable - (name argiope) - (modes native) - (libraries uri lambdasoup lwt ezcurl-lwt containers)) + (name argiope) + (modes native) + (libraries uri lambdasoup lwt ezcurl-lwt containers)) diff --git a/src/core/dune b/src/core/dune index c622c40..4fd861b 100644 --- a/src/core/dune +++ b/src/core/dune @@ -1,4 +1,3 @@ - (library (name ezcurl_core) (public_name ezcurl.core) diff --git a/src/core/ezcurl_core.ml b/src/core/ezcurl_core.ml index 01b67f6..aff44d0 100644 --- a/src/core/ezcurl_core.ml +++ b/src/core/ezcurl_core.ml @@ -246,20 +246,19 @@ module type S = sig @param meth which method to use (see {!meth}) @param tries how many times to retry in case of [CURLE_AGAIN] code @param client a client to reuse (instead of allocating a new one) - @param range an optional - {{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range} - to fetch (either to get large pages - by chunks, or to resume an interrupted download). + @param range + an optional + {{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} + byte range} to fetch (either to get large pages by chunks, or to resume + an interrupted download). @param config configuration to set - @param content the content to send as the query's body, either - a [`String s] to write a single string, or [`Write f] - where [f] is a callback that is called on a buffer [b] with len [n] - (as in [f b n]) and returns how many bytes it wrote in the buffer - [b] starting at index [0] (at most [n] bytes). - It must return [0] when the content is entirely written, and not - before. - @param headers headers of the query - *) + @param content + the content to send as the query's body, either a [`String s] to write a + single string, or [`Write f] where [f] is a callback that is called on a + buffer [b] with len [n] (as in [f b n]) and returns how many bytes it + wrote in the buffer [b] starting at index [0] (at most [n] bytes). It + must return [0] when the content is entirely written, and not before. + @param headers headers of the query *) (** Push-stream of bytes @since NEXT_RELEASE *) @@ -292,9 +291,7 @@ module type S = sig url:string -> unit -> (string response, Curl.curlCode * string) result io - (** Shortcut for [http ~meth:GET] - See {!http} for more info. - *) + (** Shortcut for [http ~meth:GET] See {!http} for more info. *) val put : ?tries:int -> @@ -305,9 +302,7 @@ module type S = sig content:[ `String of string | `Write of bytes -> int -> int ] -> unit -> (string response, Curl.curlCode * string) result io - (** Shortcut for [http ~meth:PUT] - See {!http} for more info. - *) + (** Shortcut for [http ~meth:PUT] See {!http} for more info. *) val post : ?tries:int -> @@ -319,9 +314,7 @@ module type S = sig url:string -> unit -> (string response, Curl.curlCode * string) result io - (** Shortcut for [http ~meth:(POST params)] - See {!http} for more info. - *) + (** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *) end exception Parse_error of Curl.curlCode * string @@ -368,22 +361,21 @@ module Make (IO : IO) : S with type 'a io = 'a IO.t = struct n := !n + len; r and seek i o = - begin match o with + (match o with | Curl.SEEK_SET -> n := Int64.to_int i | SEEK_END -> n := String.length s + Int64.to_int i - | SEEK_CUR -> n := !n + Int64.to_int i - end; + | SEEK_CUR -> n := !n + Int64.to_int i); Curl.SEEKFUNC_OK - in read, seek + in + read, seek | `Write f -> let buf = Bytes.create 1024 in let read i = let len = min i (Bytes.length buf) in let n = f buf len in Bytes.sub_string buf i n - and seek _ _ = - Curl.SEEKFUNC_CANTSEEK - in read, seek + and seek _ _ = Curl.SEEKFUNC_CANTSEEK in + read, seek let content_size_ = function | `String s -> Some (String.length s) diff --git a/src/core/ezcurl_core.mli b/src/core/ezcurl_core.mli index 1a3620a..8d2ac92 100644 --- a/src/core/ezcurl_core.mli +++ b/src/core/ezcurl_core.mli @@ -16,8 +16,8 @@ module Config : sig end type t = private { curl: Curl.t } [@@unboxed] -(** A client, i.e. a cURL instance. - The wrapping record has been present since NEXT_RELEASE *) +(** A client, i.e. a cURL instance. The wrapping record has been present since + NEXT_RELEASE *) val make : ?set_opts:(Curl.t -> unit) -> @@ -26,10 +26,13 @@ val make : unit -> t (** Create a new client. - @param set_opts called before returning the client, to set options - @param cookiejar_file if provided, tell curl to use the given file path to store/load cookies (since NEXT_RELEASE) - @param enable_session_cookies if provided, enable cookie handling in curl so it store/load cookies (since NEXT_RELEASE) - *) + @param set_opts called before returning the client, to set options + @param cookiejar_file + if provided, tell curl to use the given file path to store/load cookies + (since NEXT_RELEASE) + @param enable_session_cookies + if provided, enable cookie handling in curl so it store/load cookies + (since NEXT_RELEASE) *) val delete : t -> unit (** Delete the client. It cannot be used anymore. *) @@ -44,17 +47,17 @@ val set_no_signal : bool -> unit (** Cookie handling. -@since NEXT_RELEASE *) + @since NEXT_RELEASE *) module Cookies : sig val flush_cookiejar : t -> unit - (** If [cookiejar_file] was provided in {!make}, this flushes the current set of cookies - to the provided file. - @since NEXT_RELEASE *) + (** If [cookiejar_file] was provided in {!make}, this flushes the current set + of cookies to the provided file. + @since NEXT_RELEASE *) val reload_cookiejar : t -> unit - (** If [cookiejar_file] was provided in {!make}, this reloads cookies from - the provided file. - @since NEXT_RELEASE *) + (** If [cookiejar_file] was provided in {!make}, this reloads cookies from the + provided file. + @since NEXT_RELEASE *) val get_cookies : t -> string list (** Get cookie list (in netscape format) *) @@ -72,11 +75,10 @@ end type response_info = { ri_response_time: float; - (** Total time (in seconds) for the request/response pair. - See {!Curl.get_totaltime}. *) + (** Total time (in seconds) for the request/response pair. See + {!Curl.get_totaltime}. *) ri_redirect_count: int; - (** Number of redirects cURL followed. - See {!Curl.get_redirectcount}. *) + (** Number of redirects cURL followed. See {!Curl.get_redirectcount}. *) } (** Metadata about a response from the server. *) @@ -85,7 +87,8 @@ val string_of_response_info : response_info -> string type 'body response = { code: int; - (** Response code. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *) + (** Response code. See + https://developer.mozilla.org/en-US/docs/Web/HTTP/Status *) headers: (string * string) list; (** Response headers *) body: 'body; (** Response body, or [""] *) info: response_info; (** Information about the response *) @@ -98,8 +101,9 @@ val pp_response_with : val pp_response : Format.formatter -> string response -> unit val string_of_response : string response -> string -(** The {{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method} - to use *) +(** The + {{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods} HTTP method} + to use *) type meth = | GET | POST of Curl.curlHTTPPost list @@ -145,20 +149,19 @@ module type S = sig @param meth which method to use (see {!meth}) @param tries how many times to retry in case of [CURLE_AGAIN] code @param client a client to reuse (instead of allocating a new one) - @param range an optional - {{: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} byte range} - to fetch (either to get large pages - by chunks, or to resume an interrupted download). + @param range + an optional + {{:https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests} + byte range} to fetch (either to get large pages by chunks, or to resume + an interrupted download). @param config configuration to set - @param content the content to send as the query's body, either - a [`String s] to write a single string, or [`Write f] - where [f] is a callback that is called on a buffer [b] with len [n] - (as in [f b n]) and returns how many bytes it wrote in the buffer - [b] starting at index [0] (at most [n] bytes). - It must return [0] when the content is entirely written, and not - before. - @param headers headers of the query - *) + @param content + the content to send as the query's body, either a [`String s] to write a + single string, or [`Write f] where [f] is a callback that is called on a + buffer [b] with len [n] (as in [f b n]) and returns how many bytes it + wrote in the buffer [b] starting at index [0] (at most [n] bytes). It + must return [0] when the content is entirely written, and not before. + @param headers headers of the query *) (** Push-based stream of bytes @since NEXT_RELEASE *) @@ -179,10 +182,9 @@ module type S = sig write_into:#input_stream -> unit -> (unit response, Curl.curlCode * string) result io - (** HTTP call via cURL, with a streaming response body. - The body is given to [write_into] by chunks, - then [write_into#on_close ()] is called - and the response is returned. + (** HTTP call via cURL, with a streaming response body. The body is given to + [write_into] by chunks, then [write_into#on_close ()] is called and the + response is returned. @since NEXT_RELEASE *) val get : @@ -194,9 +196,7 @@ module type S = sig url:string -> unit -> (string response, Curl.curlCode * string) result io - (** Shortcut for [http ~meth:GET] - See {!http} for more info. - *) + (** Shortcut for [http ~meth:GET] See {!http} for more info. *) val put : ?tries:int -> @@ -207,9 +207,7 @@ module type S = sig content:[ `String of string | `Write of bytes -> int -> int ] -> unit -> (string response, Curl.curlCode * string) result io - (** Shortcut for [http ~meth:PUT] - See {!http} for more info. - *) + (** Shortcut for [http ~meth:PUT] See {!http} for more info. *) val post : ?tries:int -> @@ -221,9 +219,7 @@ module type S = sig url:string -> unit -> (string response, Curl.curlCode * string) result io - (** Shortcut for [http ~meth:(POST params)] - See {!http} for more info. - *) + (** Shortcut for [http ~meth:(POST params)] See {!http} for more info. *) end module Make (IO : IO) : S with type 'a io = 'a IO.t diff --git a/src/lwt/dune b/src/lwt/dune index 2e6678f..4f0a948 100644 --- a/src/lwt/dune +++ b/src/lwt/dune @@ -1,4 +1,3 @@ - (library (name ezcurl_lwt) (public_name ezcurl-lwt) diff --git a/src/sync/ezcurl.ml b/src/sync/ezcurl.ml index 5baca81..22eaccc 100644 --- a/src/sync/ezcurl.ml +++ b/src/sync/ezcurl.ml @@ -16,4 +16,3 @@ include Ezcurl_core.Make (struct Curl.CURLE_OK with Curl.CurlException (c, _, _) -> c end) - diff --git a/test/dune b/test/dune index 13eec6a..9d50827 100644 --- a/test/dune +++ b/test/dune @@ -1,3 +1,3 @@ (test - (name basic_test) - (libraries ezcurl)) + (name basic_test) + (libraries ezcurl))