(* https://github.com/open-telemetry/oteps/blob/main/text/0035-opentelemetry-protocol.md https://github.com/open-telemetry/oteps/blob/main/text/0099-otlp-http.md *) module OT = Opentelemetry open Opentelemetry include Common_ let needs_gc_metrics = Atomic.make false let gc_metrics = AList.make () (* side channel for GC, appended to {!E_metrics}'s data *) (* capture current GC metrics and push them into {!gc_metrics} for later collection *) let sample_gc_metrics () = Atomic.set needs_gc_metrics false; let l = OT.Metrics.make_resource_metrics ~attrs:(Opentelemetry.GC_metrics.get_runtime_attributes ()) @@ Opentelemetry.GC_metrics.get_metrics () in AList.add gc_metrics l module Config = Config let _init_curl = lazy (Curl.global_init Curl.CURLINIT_GLOBALALL; at_exit Curl.global_cleanup) type error = [ `Status of int * Opentelemetry.Proto.Status.status | `Failure of string | `Sysbreak ] let n_errors = Atomic.make 0 let n_dropped = Atomic.make 0 let report_err_ = function | `Sysbreak -> Printf.eprintf "opentelemetry: ctrl-c captured, stopping\n%!" | `Failure msg -> Format.eprintf "@[<2>opentelemetry: export failed: %s@]@." msg | `Status (code, { Opentelemetry.Proto.Status.code = scode; message; details }) -> let pp_details out l = List.iter (fun s -> Format.fprintf out "%S;@ " (Bytes.unsafe_to_string s)) l in Format.eprintf "@[<2>opentelemetry: export failed with@ http code=%d@ status \ {@[code=%ld;@ message=%S;@ details=[@[%a@]]@]}@]@." code scode (Bytes.unsafe_to_string message) pp_details details module Httpc : sig type t val create : unit -> t val send : t -> path:string -> decode:(Pbrt.Decoder.t -> 'a) -> string -> ('a, error) result val cleanup : t -> unit end = struct open Opentelemetry.Proto let () = Lazy.force _init_curl (* TODO: use Curl.Multi, etc. instead? *) type t = { buf_res: Buffer.t; curl: Curl.t; } let create () : t = { buf_res = Buffer.create 256; curl = Curl.init () } let cleanup self = Curl.cleanup self.curl (* send the content to the remote endpoint/path *) let send (self : t) ~path ~decode (bod : string) : ('a, error) result = let { curl; buf_res } = self in Curl.reset curl; if !debug_ then Curl.set_verbose curl true; Curl.set_url curl (!url ^ path); Curl.set_httppost curl []; let to_http_header (k, v) = Printf.sprintf "%s: %s" k v in let http_headers = List.map to_http_header !headers in Curl.set_httpheader curl ("Content-Type: application/x-protobuf" :: http_headers); (* write body *) Curl.set_post curl true; Curl.set_postfieldsize curl (String.length bod); Curl.set_readfunction curl (let i = ref 0 in fun n -> if !debug_ then Printf.eprintf "curl asks for %d bytes\n%!" n; let len = min n (String.length bod - !i) in let s = String.sub bod !i len in if !debug_ then Printf.eprintf "gave curl %d bytes\n%!" len; i := !i + len; s); (* read result's body *) Buffer.clear buf_res; Curl.set_writefunction curl (fun s -> Buffer.add_string buf_res s; String.length s); try match Curl.perform curl with | () -> let code = Curl.get_responsecode curl in if !debug_ then Printf.eprintf "result body: %S\n%!" (Buffer.contents buf_res); let dec = Pbrt.Decoder.of_string (Buffer.contents buf_res) in if code >= 200 && code < 300 then ( let res = decode dec in Ok res ) else ( let status = Status.decode_status dec in Error (`Status (code, status)) ) | exception Sys.Break -> Error `Sysbreak | exception Curl.CurlException (_, code, msg) -> let status = Status.default_status ~code:(Int32.of_int code) ~message:(Bytes.unsafe_of_string msg) () in Error (`Status (code, status)) with | Sys.Break -> Error `Sysbreak | e -> Error (`Failure (Printexc.to_string e)) end module type BATCH = sig end (** Batch of resources to be pushed later. This type is thread-safe. *) module Batch : sig type 'a t val push : 'a t -> 'a -> bool (** [push batch x] pushes [x] into the batch, and heuristically returns [true] if the batch is ready to be emitted (to know if we should wake up the sending thread, if any) *) val push' : 'a t -> 'a -> unit val is_ready : now:Mtime.t -> _ t -> bool (** is the batch ready to be sent? This is heuristic. *) val pop_if_ready : ?force:bool -> now:Mtime.t -> 'a t -> 'a list option (** Is the batch ready to be emitted? If batching is disabled, this is true as soon as {!is_empty} is false. If a timeout is provided for this batch, then it will be ready if an element has been in it for at least the timeout. @param now passed to implement timeout *) val make : ?batch:int -> ?timeout:Mtime.span -> unit -> 'a t (** Create a new batch *) end = struct type 'a t = { lock: Mutex.t; mutable size: int; mutable q: 'a list; batch: int option; high_watermark: int; timeout: Mtime.span option; mutable start: Mtime.t; } let make ?batch ?timeout () : _ t = Option.iter (fun b -> assert (b > 0)) batch; let high_watermark = Option.fold ~none:100 ~some:(fun x -> x * 10) batch in { lock = Mutex.create (); size = 0; start = Mtime_clock.now (); q = []; batch; timeout; high_watermark; } let is_empty_ self = self.size = 0 let timeout_expired_ ~now self : bool = match self.timeout with | Some t -> let elapsed = Mtime.span now self.start in Mtime.Span.compare elapsed t >= 0 | None -> false let is_full_ self : bool = match self.batch with | None -> self.size > 0 | Some b -> self.size >= b let is_ready ~now self : bool = let@ () = with_mutex_ self.lock in is_full_ self || timeout_expired_ ~now self let pop_if_ready ?(force = false) ~now (self : _ t) : _ list option = let@ () = with_mutex_ self.lock in if (force && not (is_empty_ self)) || is_full_ self || timeout_expired_ ~now self then ( let l = self.q in self.q <- []; self.size <- 0; Some l ) else None let push (self : _ t) x : bool = let@ () = with_mutex_ self.lock in if self.size >= self.high_watermark then ( (* drop this to prevent queue from growing too fast *) Atomic.incr n_dropped; true ) else ( if self.size = 0 && Option.is_some self.timeout then (* current batch starts now *) self.start <- Mtime_clock.now (); (* add to queue *) self.size <- 1 + self.size; self.q <- x :: self.q; let ready = is_full_ self in ready ) let push' self x = ignore (push self x : bool) end (** An emitter. This is used by {!Backend} below to forward traces/metrics/… from the program to whatever collector client we have. *) module type EMITTER = sig open Opentelemetry.Proto val push_trace : Trace.resource_spans list -> unit val push_metrics : Metrics.resource_metrics list -> unit val push_logs : Logs.resource_logs list -> unit val set_on_tick_callbacks : (unit -> unit) list ref -> unit val tick : unit -> unit val cleanup : unit -> unit end (* start a thread in the background, running [f()] *) let start_bg_thread (f : unit -> unit) : unit = let run () = (* block some signals: USR1 USR2 TERM PIPE ALARM STOP, see [$ kill -L] *) ignore (Thread.sigmask Unix.SIG_BLOCK [ 10; 12; 13; 14; 15; 19 ] : _ list); f () in ignore (Thread.create run () : Thread.t) (* make an emitter. exceptions inside should be caught, see https://opentelemetry.io/docs/reference/specification/error-handling/ *) let mk_emitter ~stop ~(config : Config.t) () : (module EMITTER) = let open Proto in (* local helpers *) let open struct let timeout = if config.batch_timeout_ms > 0 then Some Mtime.Span.(config.batch_timeout_ms * ms) else None let batch_traces : Trace.resource_spans list Batch.t = Batch.make ?batch:config.batch_traces ?timeout () let batch_metrics : Metrics.resource_metrics list Batch.t = Batch.make ?batch:config.batch_metrics ?timeout () let batch_logs : Logs.resource_logs list Batch.t = Batch.make ?batch:config.batch_logs ?timeout () let on_tick_cbs_ = Atomic.make (ref []) let set_on_tick_callbacks = Atomic.set on_tick_cbs_ let send_http_ (httpc : Httpc.t) encoder ~path ~encode x : unit = Pbrt.Encoder.reset encoder; encode x encoder; let data = Pbrt.Encoder.to_string encoder in match Httpc.send httpc ~path ~decode:(fun _ -> ()) data with | Ok () -> () | Error `Sysbreak -> Printf.eprintf "ctrl-c captured, stopping\n%!"; Atomic.set stop true | Error err -> (* TODO: log error _via_ otel? *) Atomic.incr n_errors; report_err_ err let send_metrics_http curl encoder (l : Metrics.resource_metrics list list) = let l = List.fold_left (fun acc l -> List.rev_append l acc) [] l in let x = Metrics_service.default_export_metrics_service_request ~resource_metrics:l () in send_http_ curl encoder ~path:"/v1/metrics" ~encode:Metrics_service.encode_export_metrics_service_request x let send_traces_http curl encoder (l : Trace.resource_spans list list) = let l = List.fold_left (fun acc l -> List.rev_append l acc) [] l in let x = Trace_service.default_export_trace_service_request ~resource_spans:l () in send_http_ curl encoder ~path:"/v1/traces" ~encode:Trace_service.encode_export_trace_service_request x let send_logs_http curl encoder (l : Logs.resource_logs list list) = let l = List.fold_left (fun acc l -> List.rev_append l acc) [] l in let x = Logs_service.default_export_logs_service_request ~resource_logs:l () in send_http_ curl encoder ~path:"/v1/logs" ~encode:Logs_service.encode_export_logs_service_request x (* emit metrics, if the batch is full or timeout lapsed *) let emit_metrics_maybe ~now ?force httpc encoder : bool = match Batch.pop_if_ready ?force ~now batch_metrics with | None -> false | Some l -> let batch = AList.pop_all gc_metrics :: l in send_metrics_http httpc encoder batch; true let emit_traces_maybe ~now ?force httpc encoder : bool = match Batch.pop_if_ready ?force ~now batch_traces with | None -> false | Some l -> send_traces_http httpc encoder l; true let emit_logs_maybe ~now ?force httpc encoder : bool = match Batch.pop_if_ready ?force ~now batch_logs with | None -> false | Some l -> send_logs_http httpc encoder l; true let[@inline] guard_exn_ where f = try f () with e -> Printf.eprintf "opentelemetry-curl: uncaught exception in %s: %s\n%!" where (Printexc.to_string e) let emit_all_force (httpc : Httpc.t) encoder = let now = Mtime_clock.now () in ignore (emit_traces_maybe ~now ~force:true httpc encoder : bool); ignore (emit_logs_maybe ~now ~force:true httpc encoder : bool); ignore (emit_metrics_maybe ~now ~force:true httpc encoder : bool) let tick_common_ () = if Atomic.get needs_gc_metrics then sample_gc_metrics (); List.iter (fun f -> try f () with e -> Printf.eprintf "on tick callback raised: %s\n" (Printexc.to_string e)) !(Atomic.get on_tick_cbs_); () let setup_ticker_thread ~tick ~finally () = (* thread that calls [tick()] regularly, to help enforce timeouts *) let tick_thread () = let@ () = Fun.protect ~finally:(fun () -> Atomic.set stop true; finally ()) in while not @@ Atomic.get stop do Thread.delay 0.5; tick () done in start_bg_thread tick_thread end in (let m = Mutex.create () in Lock.set_mutex ~lock:(fun () -> Mutex.lock m) ~unlock:(fun () -> Mutex.unlock m)); if config.bg_threads > 0 then ( let m = Mutex.create () in let cond = Condition.create () in (* loop for the thread that processes events and sends them to collector *) let bg_thread () = let httpc = Httpc.create () in let encoder = Pbrt.Encoder.create () in while not @@ Atomic.get stop do let@ () = guard_exn_ "bg thread (main loop)" in let now = Mtime_clock.now () in let do_metrics = emit_metrics_maybe ~now httpc encoder in let do_traces = emit_traces_maybe ~now httpc encoder in let do_logs = emit_logs_maybe ~now httpc encoder in if (not do_metrics) && (not do_traces) && not do_logs then (* wait for something to happen *) let@ () = with_mutex_ m in Condition.wait cond m done; (* flush remaining events once we exit *) let@ () = guard_exn_ "bg thread (cleanup)" in emit_all_force httpc encoder; Httpc.cleanup httpc in for _i = 1 to config.bg_threads do start_bg_thread bg_thread done; (* if the bg thread waits, this will wake it up so it can send batches *) let wakeup ~all () = with_mutex_ m (fun () -> if all then Condition.broadcast cond else Condition.signal cond); Thread.yield () in let tick () = tick_common_ (); let now = Mtime_clock.now () in if Atomic.get stop then wakeup ~all:true () else if Batch.is_ready ~now batch_metrics || Batch.is_ready ~now batch_traces || Batch.is_ready ~now batch_logs then wakeup ~all:false () in if config.ticker_thread then setup_ticker_thread ~tick ~finally:(fun () -> wakeup ~all:true ()) (); let module M = struct let push_trace e = if Batch.push batch_traces e then wakeup ~all:false () let push_metrics e = if Batch.push batch_metrics e then wakeup ~all:false () let push_logs e = if Batch.push batch_logs e then wakeup ~all:false () let set_on_tick_callbacks = set_on_tick_callbacks let tick = tick let cleanup () = Atomic.set stop true; if !debug_ then Printf.eprintf "opentelemetry: exiting…\n%!"; wakeup ~all:true () end in (module M) ) else ( let httpc = Httpc.create () in let encoder = Pbrt.Encoder.create () in let module M = struct (* we make sure that this is thread-safe, even though we don't have a background thread. There can still be a ticker thread, and there can also be several user threads that produce spans and call the emit functions. *) let push_trace e = let@ () = guard_exn_ "push trace" in Batch.push' batch_traces e; let now = Mtime_clock.now () in let@ () = Lock.with_lock in ignore (emit_traces_maybe ~now httpc encoder : bool) let push_metrics e = let@ () = guard_exn_ "push metrics" in if Atomic.get needs_gc_metrics then sample_gc_metrics (); Batch.push' batch_metrics e; let now = Mtime_clock.now () in let@ () = Lock.with_lock in ignore (emit_metrics_maybe ~now httpc encoder : bool) let push_logs e = let@ () = guard_exn_ "push logs" in Batch.push' batch_logs e; let now = Mtime_clock.now () in let@ () = Lock.with_lock in ignore (emit_logs_maybe ~now httpc encoder : bool) let set_on_tick_callbacks = set_on_tick_callbacks let tick () = if Atomic.get needs_gc_metrics then sample_gc_metrics (); let@ () = Lock.with_lock in let now = Mtime_clock.now () in ignore (emit_traces_maybe ~now httpc encoder : bool); ignore (emit_metrics_maybe ~now httpc encoder : bool); ignore (emit_logs_maybe ~now httpc encoder : bool); () (* make sure we have a ticker thread, if required *) let () = if config.ticker_thread then setup_ticker_thread ~tick ~finally:ignore () let cleanup () = if !debug_ then Printf.eprintf "opentelemetry: exiting…\n%!"; emit_all_force httpc encoder; Httpc.cleanup httpc end in (module M) ) module Backend (Arg : sig val stop : bool Atomic.t val config : Config.t end) () : Opentelemetry.Collector.BACKEND = struct include (val mk_emitter ~stop:Arg.stop ~config:Arg.config ()) open Opentelemetry.Proto open Opentelemetry.Collector let send_trace : Trace.resource_spans list sender = { send = (fun l ~ret -> (if !debug_ then let@ () = Lock.with_lock in Format.eprintf "send spans %a@." (Format.pp_print_list Trace.pp_resource_spans) l); push_trace l; ret ()); } let last_sent_metrics = Atomic.make (Mtime_clock.now ()) let timeout_sent_metrics = Mtime.Span.(5 * s) (* send metrics from time to time *) let signal_emit_gc_metrics () = Atomic.set needs_gc_metrics true let additional_metrics () : Metrics.resource_metrics list = (* add exporter metrics to the lot? *) let last_emit = Atomic.get last_sent_metrics in let now = Mtime_clock.now () in let add_own_metrics = let elapsed = Mtime.span last_emit now in Mtime.Span.compare elapsed timeout_sent_metrics > 0 in (* there is a possible race condition here, as several threads might update metrics at the same time. But that's harmless. *) if add_own_metrics then ( let open OT.Metrics in Atomic.set last_sent_metrics now; [ make_resource_metrics [ sum ~name:"otel.export.dropped" ~is_monotonic:true [ int ~start_time_unix_nano:(Mtime.to_uint64_ns last_emit) ~now:(Mtime.to_uint64_ns now) (Atomic.get n_dropped); ]; sum ~name:"otel.export.errors" ~is_monotonic:true [ int ~start_time_unix_nano:(Mtime.to_uint64_ns last_emit) ~now:(Mtime.to_uint64_ns now) (Atomic.get n_errors); ]; ]; ] ) else [] let send_metrics : Metrics.resource_metrics list sender = { send = (fun m ~ret -> (if !debug_ then let@ () = Lock.with_lock in Format.eprintf "send metrics %a@." (Format.pp_print_list Metrics.pp_resource_metrics) m); let m = List.rev_append (additional_metrics ()) m in push_metrics m; ret ()); } let send_logs : Logs.resource_logs list sender = { send = (fun m ~ret -> (if !debug_ then let@ () = Lock.with_lock in Format.eprintf "send logs %a@." (Format.pp_print_list Logs.pp_resource_logs) m); push_logs m; ret ()); } end let setup_ ?(stop = Atomic.make false) ~(config : Config.t) () = debug_ := config.debug; let module B = Backend (struct let stop = stop let config = config end) () in Opentelemetry.Collector.set_backend (module B); B.cleanup let setup ?stop ?(config = Config.make ()) ?(enable = true) () = if enable then ( let cleanup = setup_ ?stop ~config () in at_exit cleanup ) let with_setup ?stop ?(config = Config.make ()) ?(enable = true) () f = if enable then ( let cleanup = setup_ ?stop ~config () in Fun.protect ~finally:cleanup f ) else f ()