make Eio collector thread safe

The backend cannot take a switch, because switches cannot be shared
across domains, but the backend is accessed across domains from a global
variable.
This commit is contained in:
Shon Feder 2025-07-29 23:37:26 -04:00
parent 7cdadfaeeb
commit ddbdc80d57
No known key found for this signature in database
2 changed files with 117 additions and 135 deletions

View file

@ -136,7 +136,7 @@ end = struct
let create net = Httpc.make ~https:(Some (https ~authenticator)) net let create net = Httpc.make ~https:(Some (https ~authenticator)) net
(* send the content to the remote endpoint/path *) (* send the content to the remote endpoint/path *)
let send (client : t) ~url ~decode (bod : string) : ('a, error) result = let send (client : t) ~url ~decode (body : string) : ('a, error) result =
Switch.run @@ fun sw -> Switch.run @@ fun sw ->
let uri = Uri.of_string url in let uri = Uri.of_string url in
@ -146,7 +146,7 @@ end = struct
Header.(add headers "Content-Type" "application/x-protobuf") Header.(add headers "Content-Type" "application/x-protobuf")
in in
let body = Cohttp_eio.Body.of_string bod in let body = Cohttp_eio.Body.of_string body in
let r = let r =
try try
let r = Httpc.post client ~sw ~headers ~body uri in let r = Httpc.post client ~sw ~headers ~body uri in
@ -223,32 +223,16 @@ end
exceptions inside should be caught, see exceptions inside should be caught, see
https://opentelemetry.io/docs/reference/specification/error-handling/ *) https://opentelemetry.io/docs/reference/specification/error-handling/ *)
let mk_emitter ~sw ~stop ~(config : Config.t) ~(net : _ Eio.Net.t) () : let mk_emitter ~stop ~net (config : Config.t) : (module EMITTER) =
(module EMITTER) =
let open Proto in
(* local helpers *) (* local helpers *)
let open struct let open struct
let timeout = let client =
if config.batch_timeout_ms > 0 then (* Prime RNG state for TLS *)
Some Mtime.Span.(config.batch_timeout_ms * ms) Mirage_crypto_rng_unix.use_default ();
else Httpc.create net
None
let batch_traces : Trace.resource_spans Batch.t = let send_http ~url data : unit =
Batch.make ?batch:config.batch_traces ?timeout () let r = Httpc.send client ~url ~decode:(`Ret ()) data in
let batch_metrics : Metrics.resource_metrics Batch.t =
Batch.make ?batch:config.batch_metrics ?timeout ()
let batch_logs : Logs.resource_logs Batch.t =
Batch.make ?batch:config.batch_logs ?timeout ()
let on_tick_cbs_ = Atomic.make (AList.make ())
let set_on_tick_callbacks = Atomic.set on_tick_cbs_
let send_http_ (httpc : Httpc.t) ~url data : unit =
let r = Httpc.send httpc ~url ~decode:(`Ret ()) data in
match r with match r with
| Ok () -> () | Ok () -> ()
| Error `Sysbreak -> | Error `Sysbreak ->
@ -261,24 +245,25 @@ let mk_emitter ~sw ~stop ~(config : Config.t) ~(net : _ Eio.Net.t) () :
(* avoid crazy error loop *) (* avoid crazy error loop *)
Eio_unix.sleep 3. Eio_unix.sleep 3.
(* emit metrics, if the batch is full or timeout lapsed *) let timeout =
let emit_metrics_maybe ~now ?force client () = if config.batch_timeout_ms > 0 then
Batch.pop_if_ready ?force ~now batch_metrics Some Mtime.Span.(config.batch_timeout_ms * ms)
|> Option.iter (fun collected_metrics -> else
let gc_metrics = GC_metrics.drain () in None
gc_metrics @ collected_metrics
|> Signal.Encode.metrics
|> send_http_ client ~url:config.url_metrics)
let emit_traces_maybe ~now ?force client () = let batch_traces : Proto.Trace.resource_spans Batch.t =
Batch.pop_if_ready ?force ~now batch_traces Batch.make ?batch:config.batch_traces ?timeout ()
|> Option.iter (fun ts ->
Signal.Encode.traces ts |> send_http_ client ~url:config.url_traces)
let emit_logs_maybe ~now ?force client () = let batch_metrics : Proto.Metrics.resource_metrics Batch.t =
Batch.pop_if_ready ?force ~now batch_logs Batch.make ?batch:config.batch_metrics ?timeout ()
|> Option.iter (fun ls ->
Signal.Encode.logs ls |> send_http_ client ~url:config.url_logs) let batch_logs : Proto.Logs.resource_logs Batch.t =
Batch.make ?batch:config.batch_logs ?timeout ()
let push_to_batch b e =
match Batch.push b e with
| `Ok -> ()
| `Dropped -> Atomic.incr n_errors
let[@inline] guard_exn_ where f = let[@inline] guard_exn_ where f =
try f () try f ()
@ -287,104 +272,84 @@ let mk_emitter ~sw ~stop ~(config : Config.t) ~(net : _ Eio.Net.t) () :
Printf.eprintf "opentelemetry-eio: uncaught exception in %s: %s\n%s\n%!" Printf.eprintf "opentelemetry-eio: uncaught exception in %s: %s\n%s\n%!"
where (Printexc.to_string e) bt where (Printexc.to_string e) bt
let emit_all_force (httpc : Httpc.t) : unit = let push_traces x =
let now = Mtime_clock.now () in let@ () = guard_exn_ "push trace" in
Fiber.all push_to_batch batch_traces x
[
emit_logs_maybe ~now ~force:true httpc;
emit_metrics_maybe ~now ~force:true httpc;
emit_traces_maybe ~now ~force:true httpc;
]
let tick_common_ () = let push_metrics x =
if Config.Env.get_debug () then let@ () = guard_exn_ "push metrics" in
Printf.eprintf "tick (from %d)\n%!" (tid ());
sample_gc_metrics_if_needed (); sample_gc_metrics_if_needed ();
push_to_batch batch_metrics x
let push_logs x =
let@ () = guard_exn_ "push logs" in
push_to_batch batch_logs x
let maybe_emit (batch : 'a Batch.t) url (f : 'a list -> string) ~now ~force
() : unit =
Batch.pop_if_ready ~force ~now batch
|> Option.iter (fun signals -> f signals |> send_http ~url)
let emit_traces_maybe =
maybe_emit batch_traces config.url_traces Signal.Encode.traces
let emit_metrics_maybe =
maybe_emit batch_metrics config.url_metrics (fun collected_metrics ->
let gc_metrics = GC_metrics.drain () in
gc_metrics @ collected_metrics |> Signal.Encode.metrics)
let emit_logs_maybe =
maybe_emit batch_logs config.url_logs Signal.Encode.logs
let emit_all ~force : unit =
Switch.run @@ fun sw ->
let now = Mtime_clock.now () in
Fiber.fork ~sw @@ emit_logs_maybe ~now ~force;
Fiber.fork ~sw @@ emit_metrics_maybe ~now ~force;
Fiber.fork ~sw @@ emit_traces_maybe ~now ~force
let on_tick_cbs_ = Atomic.make (AList.make ())
let run_tick_callbacks () =
List.iter List.iter
(fun f -> (fun f ->
try f () try f ()
with e -> with e ->
Printf.eprintf "on tick callback raised: %s\n" Printf.eprintf "on tick callback raised: %s\n"
(Printexc.to_string e)) (Printexc.to_string e))
(AList.get @@ Atomic.get on_tick_cbs_); (AList.get @@ Atomic.get on_tick_cbs_)
()
(* thread that calls [tick()] regularly, to help enforce timeouts *)
let ticker_fiber ~tick : unit -> [ `Stop_daemon ] =
let rec loop () =
if Atomic.get stop then
`Stop_daemon
else (
tick ();
Eio_unix.sleep 0.5;
loop ()
)
in
loop
end in end in
let httpc =
(* Prime RNG state for TLS *)
Mirage_crypto_rng_unix.use_default ();
Httpc.create net
in
let module M = struct let module M = struct
let push_to_batch b e = let set_on_tick_callbacks = Atomic.set on_tick_cbs_
match Batch.push b e with
| `Ok -> ()
| `Dropped -> Atomic.incr n_errors
let push_trace e = let push_trace e = push_traces e
let@ () = guard_exn_ "push trace" in
push_to_batch batch_traces e;
let now = Mtime_clock.now () in
Fiber.fork ~sw (emit_traces_maybe ~now httpc)
let push_metrics e = let push_metrics e = push_metrics e
let@ () = guard_exn_ "push metrics" in
let push_logs e = push_logs e
let tick () =
if Config.Env.get_debug () then
Printf.eprintf "tick (from %d)\n%!" (tid ());
run_tick_callbacks ();
sample_gc_metrics_if_needed (); sample_gc_metrics_if_needed ();
push_to_batch batch_metrics e; emit_all ~force:false
let now = Mtime_clock.now () in
Fiber.fork ~sw (emit_metrics_maybe ~now httpc)
let push_logs e =
let@ () = guard_exn_ "push logs" in
push_to_batch batch_logs e;
let now = Mtime_clock.now () in
Fiber.fork ~sw (emit_logs_maybe ~now httpc)
let set_on_tick_callbacks = set_on_tick_callbacks
let tick_ () =
tick_common_ ();
sample_gc_metrics_if_needed ();
let now = Mtime_clock.now () in
Fiber.all
[
emit_logs_maybe ~now httpc;
emit_metrics_maybe ~now httpc;
emit_traces_maybe ~now httpc;
]
let () = Eio.Fiber.fork_daemon ~sw (ticker_fiber ~tick:tick_)
let tick () = Fiber.fork ~sw tick_
let cleanup ~on_done () = let cleanup ~on_done () =
if Config.Env.get_debug () then if Config.Env.get_debug () then
Printf.eprintf "opentelemetry: exiting…\n%!"; Printf.eprintf "opentelemetry: exiting…\n%!";
(* This must be in its own switch, because it MUST run even if the Atomic.set stop true;
surrounding switch in the environment has been cancelled. *) run_tick_callbacks ();
Switch.run @@ fun sw -> sample_gc_metrics_if_needed ();
Fiber.fork ~sw (fun () -> emit_all ~force:true;
emit_all_force httpc; on_done ()
on_done ())
end in end in
(module M : EMITTER) (module M : EMITTER)
module Backend (Emitter : EMITTER) : Opentelemetry.Collector.BACKEND = struct module Backend (Emitter : EMITTER) : Opentelemetry.Collector.BACKEND = struct
include Emitter
open Opentelemetry.Proto open Opentelemetry.Proto
open Opentelemetry.Collector open Opentelemetry.Collector
open Emitter
let send_trace : Trace.resource_spans list sender = let send_trace : Trace.resource_spans list sender =
{ {
@ -470,30 +435,46 @@ module Backend (Emitter : EMITTER) : Opentelemetry.Collector.BACKEND = struct
push_logs m; push_logs m;
ret ()); ret ());
} }
let tick = Emitter.tick
let cleanup = Emitter.cleanup
let set_on_tick_callbacks = Emitter.set_on_tick_callbacks
end end
let create_backend ~sw ?(stop = Atomic.make false) ?(config = Config.make ()) let create_backend ~sw ?(stop = Atomic.make false) ?(config = Config.make ())
(env : Eio_unix.Stdenv.base) : (module OT.Collector.BACKEND) = env : (module OT.Collector.BACKEND) =
let module E = (val mk_emitter ~sw ~stop ~config ~net:env#net ()) in let module E = (val mk_emitter ~stop ~net:env#net config) in
(module Backend (E)) let module B = Backend (E) in
(* Run a background fiber to keep the backend ticking regularly.
NOTE: This cannot be located inside the [Backend], because switches
are not thread safe, and cannot be used across domains, but the
backend is accessed across domains. *)
Eio.Fiber.fork ~sw (fun () ->
while not @@ Atomic.get stop do
Eio.Time.sleep env#clock 0.5;
B.tick ()
done);
(module B)
let setup_ ~sw ?stop ?config env : unit = let setup_ ~sw ?stop ?config env : unit =
let backend = create_backend ~sw ?stop ?config env in let backend = create_backend ?stop ?config ~sw env in
OT.Collector.set_backend backend; OT.Collector.set_backend backend
()
let setup ?stop ?config ?(enable = true) env = let setup ?stop ?config ?(enable = true) ~sw env =
if enable then Switch.run @@ fun sw -> setup_ ~sw ?stop ?config env if enable then setup_ ~sw ?stop ?config env
let remove_backend () = OT.Collector.remove_backend ~on_done:ignore () let remove_backend () = OT.Collector.remove_backend ~on_done:ignore ()
let with_setup ?stop ?(config = Config.make ()) ?(enable = true) f env = let with_setup ?stop ?config ?(enable = true) f env =
(* NOTE: We must thread the switch [sw] through to all the forked threads in if enable then
the Backend's Emitter, to ensure that we can wait on all of them to Switch.run @@ fun sw ->
complete before removing the backend during cleanup. *) snd
Switch.run (fun sw -> @@ Fiber.pair
if enable then ( (fun () -> setup_ ~sw ?stop ?config env)
setup_ ~sw ?stop ~config env; (fun () -> Fun.protect ~finally:(fun () -> remove_backend ()) f)
Switch.on_release sw remove_backend else
); f ()
f env)

View file

@ -24,6 +24,7 @@ val setup :
?stop:bool Atomic.t -> ?stop:bool Atomic.t ->
?config:Config.t -> ?config:Config.t ->
?enable:bool -> ?enable:bool ->
sw:Eio.Switch.t ->
Eio_unix.Stdenv.base -> Eio_unix.Stdenv.base ->
unit unit
(** Setup endpoint. This modifies {!Opentelemetry.Collector.backend}. (** Setup endpoint. This modifies {!Opentelemetry.Collector.backend}.
@ -43,7 +44,7 @@ val with_setup :
?stop:bool Atomic.t -> ?stop:bool Atomic.t ->
?config:Config.t -> ?config:Config.t ->
?enable:bool -> ?enable:bool ->
(Eio_unix.Stdenv.base -> 'a) -> (unit -> 'a) ->
Eio_unix.Stdenv.base -> Eio_unix.Stdenv.base ->
'a 'a
(** [with_setup () f] is like [setup(); f()] but takes care of cleaning up after (** [with_setup () f] is like [setup(); f()] but takes care of cleaning up after