From 7cdadfaeeb272ae8c48b83ef5d0ce849026da409 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Fri, 25 Jul 2025 10:18:38 -0400 Subject: [PATCH 1/4] Fix exception message --- src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml b/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml index cb21a374..3c5f6906 100644 --- a/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml +++ b/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml @@ -284,9 +284,8 @@ let mk_emitter ~sw ~stop ~(config : Config.t) ~(net : _ Eio.Net.t) () : try f () with e -> let bt = Printexc.get_backtrace () in - Printf.eprintf - "opentelemetry-curl: uncaught exception in %s: %s\n%s\n%!" where - (Printexc.to_string e) bt + Printf.eprintf "opentelemetry-eio: uncaught exception in %s: %s\n%s\n%!" + where (Printexc.to_string e) bt let emit_all_force (httpc : Httpc.t) : unit = let now = Mtime_clock.now () in From ddbdc80d57960011848ba7444e9c808b2454eeb7 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Tue, 29 Jul 2025 23:37:26 -0400 Subject: [PATCH 2/4] make Eio collector thread safe The backend cannot take a switch, because switches cannot be shared across domains, but the backend is accessed across domains from a global variable. 
--- .../opentelemetry_client_cohttp_eio.ml | 249 ++++++++---------- .../opentelemetry_client_cohttp_eio.mli | 3 +- 2 files changed, 117 insertions(+), 135 deletions(-) diff --git a/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml b/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml index 3c5f6906..b6266ad9 100644 --- a/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml +++ b/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.ml @@ -136,7 +136,7 @@ end = struct let create net = Httpc.make ~https:(Some (https ~authenticator)) net (* send the content to the remote endpoint/path *) - let send (client : t) ~url ~decode (bod : string) : ('a, error) result = + let send (client : t) ~url ~decode (body : string) : ('a, error) result = Switch.run @@ fun sw -> let uri = Uri.of_string url in @@ -146,7 +146,7 @@ end = struct Header.(add headers "Content-Type" "application/x-protobuf") in - let body = Cohttp_eio.Body.of_string bod in + let body = Cohttp_eio.Body.of_string body in let r = try let r = Httpc.post client ~sw ~headers ~body uri in @@ -223,32 +223,16 @@ end exceptions inside should be caught, see https://opentelemetry.io/docs/reference/specification/error-handling/ *) -let mk_emitter ~sw ~stop ~(config : Config.t) ~(net : _ Eio.Net.t) () : - (module EMITTER) = - let open Proto in +let mk_emitter ~stop ~net (config : Config.t) : (module EMITTER) = (* local helpers *) let open struct - let timeout = - if config.batch_timeout_ms > 0 then - Some Mtime.Span.(config.batch_timeout_ms * ms) - else - None + let client = + (* Prime RNG state for TLS *) + Mirage_crypto_rng_unix.use_default (); + Httpc.create net - let batch_traces : Trace.resource_spans Batch.t = - Batch.make ?batch:config.batch_traces ?timeout () - - let batch_metrics : Metrics.resource_metrics Batch.t = - Batch.make ?batch:config.batch_metrics ?timeout () - - let batch_logs : Logs.resource_logs Batch.t = - Batch.make ?batch:config.batch_logs ?timeout () - - let on_tick_cbs_ = Atomic.make 
(AList.make ()) - - let set_on_tick_callbacks = Atomic.set on_tick_cbs_ - - let send_http_ (httpc : Httpc.t) ~url data : unit = - let r = Httpc.send httpc ~url ~decode:(`Ret ()) data in + let send_http ~url data : unit = + let r = Httpc.send client ~url ~decode:(`Ret ()) data in match r with | Ok () -> () | Error `Sysbreak -> @@ -261,24 +245,25 @@ let mk_emitter ~sw ~stop ~(config : Config.t) ~(net : _ Eio.Net.t) () : (* avoid crazy error loop *) Eio_unix.sleep 3. - (* emit metrics, if the batch is full or timeout lapsed *) - let emit_metrics_maybe ~now ?force client () = - Batch.pop_if_ready ?force ~now batch_metrics - |> Option.iter (fun collected_metrics -> - let gc_metrics = GC_metrics.drain () in - gc_metrics @ collected_metrics - |> Signal.Encode.metrics - |> send_http_ client ~url:config.url_metrics) + let timeout = + if config.batch_timeout_ms > 0 then + Some Mtime.Span.(config.batch_timeout_ms * ms) + else + None - let emit_traces_maybe ~now ?force client () = - Batch.pop_if_ready ?force ~now batch_traces - |> Option.iter (fun ts -> - Signal.Encode.traces ts |> send_http_ client ~url:config.url_traces) + let batch_traces : Proto.Trace.resource_spans Batch.t = + Batch.make ?batch:config.batch_traces ?timeout () - let emit_logs_maybe ~now ?force client () = - Batch.pop_if_ready ?force ~now batch_logs - |> Option.iter (fun ls -> - Signal.Encode.logs ls |> send_http_ client ~url:config.url_logs) + let batch_metrics : Proto.Metrics.resource_metrics Batch.t = + Batch.make ?batch:config.batch_metrics ?timeout () + + let batch_logs : Proto.Logs.resource_logs Batch.t = + Batch.make ?batch:config.batch_logs ?timeout () + + let push_to_batch b e = + match Batch.push b e with + | `Ok -> () + | `Dropped -> Atomic.incr n_errors let[@inline] guard_exn_ where f = try f () @@ -287,104 +272,84 @@ let mk_emitter ~sw ~stop ~(config : Config.t) ~(net : _ Eio.Net.t) () : Printf.eprintf "opentelemetry-eio: uncaught exception in %s: %s\n%s\n%!" 
where (Printexc.to_string e) bt - let emit_all_force (httpc : Httpc.t) : unit = - let now = Mtime_clock.now () in - Fiber.all - [ - emit_logs_maybe ~now ~force:true httpc; - emit_metrics_maybe ~now ~force:true httpc; - emit_traces_maybe ~now ~force:true httpc; - ] + let push_traces x = + let@ () = guard_exn_ "push trace" in + push_to_batch batch_traces x - let tick_common_ () = - if Config.Env.get_debug () then - Printf.eprintf "tick (from %d)\n%!" (tid ()); + let push_metrics x = + let@ () = guard_exn_ "push metrics" in sample_gc_metrics_if_needed (); + push_to_batch batch_metrics x + + let push_logs x = + let@ () = guard_exn_ "push logs" in + push_to_batch batch_logs x + + let maybe_emit (batch : 'a Batch.t) url (f : 'a list -> string) ~now ~force + () : unit = + Batch.pop_if_ready ~force ~now batch + |> Option.iter (fun signals -> f signals |> send_http ~url) + + let emit_traces_maybe = + maybe_emit batch_traces config.url_traces Signal.Encode.traces + + let emit_metrics_maybe = + maybe_emit batch_metrics config.url_metrics (fun collected_metrics -> + let gc_metrics = GC_metrics.drain () in + gc_metrics @ collected_metrics |> Signal.Encode.metrics) + + let emit_logs_maybe = + maybe_emit batch_logs config.url_logs Signal.Encode.logs + + let emit_all ~force : unit = + Switch.run @@ fun sw -> + let now = Mtime_clock.now () in + Fiber.fork ~sw @@ emit_logs_maybe ~now ~force; + Fiber.fork ~sw @@ emit_metrics_maybe ~now ~force; + Fiber.fork ~sw @@ emit_traces_maybe ~now ~force + + let on_tick_cbs_ = Atomic.make (AList.make ()) + + let run_tick_callbacks () = List.iter (fun f -> try f () with e -> Printf.eprintf "on tick callback raised: %s\n" (Printexc.to_string e)) - (AList.get @@ Atomic.get on_tick_cbs_); - () - - (* thread that calls [tick()] regularly, to help enforce timeouts *) - let ticker_fiber ~tick : unit -> [ `Stop_daemon ] = - let rec loop () = - if Atomic.get stop then - `Stop_daemon - else ( - tick (); - Eio_unix.sleep 0.5; - loop () - ) - in - loop + 
(AList.get @@ Atomic.get on_tick_cbs_) end in - let httpc = - (* Prime RNG state for TLS *) - Mirage_crypto_rng_unix.use_default (); - Httpc.create net - in let module M = struct - let push_to_batch b e = - match Batch.push b e with - | `Ok -> () - | `Dropped -> Atomic.incr n_errors + let set_on_tick_callbacks = Atomic.set on_tick_cbs_ - let push_trace e = - let@ () = guard_exn_ "push trace" in - push_to_batch batch_traces e; - let now = Mtime_clock.now () in - Fiber.fork ~sw (emit_traces_maybe ~now httpc) + let push_trace e = push_traces e - let push_metrics e = - let@ () = guard_exn_ "push metrics" in + let push_metrics e = push_metrics e + + let push_logs e = push_logs e + + let tick () = + if Config.Env.get_debug () then + Printf.eprintf "tick (from %d)\n%!" (tid ()); + run_tick_callbacks (); sample_gc_metrics_if_needed (); - push_to_batch batch_metrics e; - let now = Mtime_clock.now () in - Fiber.fork ~sw (emit_metrics_maybe ~now httpc) - - let push_logs e = - let@ () = guard_exn_ "push logs" in - push_to_batch batch_logs e; - let now = Mtime_clock.now () in - Fiber.fork ~sw (emit_logs_maybe ~now httpc) - - let set_on_tick_callbacks = set_on_tick_callbacks - - let tick_ () = - tick_common_ (); - sample_gc_metrics_if_needed (); - let now = Mtime_clock.now () in - Fiber.all - [ - emit_logs_maybe ~now httpc; - emit_metrics_maybe ~now httpc; - emit_traces_maybe ~now httpc; - ] - - let () = Eio.Fiber.fork_daemon ~sw (ticker_fiber ~tick:tick_) - - let tick () = Fiber.fork ~sw tick_ + emit_all ~force:false let cleanup ~on_done () = if Config.Env.get_debug () then Printf.eprintf "opentelemetry: exiting…\n%!"; - (* This must be in its own switch, because it MUST run even if the - surrounding switch in the environment has been cancelled. 
*) - Switch.run @@ fun sw -> - Fiber.fork ~sw (fun () -> - emit_all_force httpc; - on_done ()) + Atomic.set stop true; + run_tick_callbacks (); + sample_gc_metrics_if_needed (); + emit_all ~force:true; + on_done () end in (module M : EMITTER) module Backend (Emitter : EMITTER) : Opentelemetry.Collector.BACKEND = struct - include Emitter open Opentelemetry.Proto open Opentelemetry.Collector + open Emitter let send_trace : Trace.resource_spans list sender = { @@ -470,30 +435,46 @@ module Backend (Emitter : EMITTER) : Opentelemetry.Collector.BACKEND = struct push_logs m; ret ()); } + + let tick = Emitter.tick + + let cleanup = Emitter.cleanup + + let set_on_tick_callbacks = Emitter.set_on_tick_callbacks end let create_backend ~sw ?(stop = Atomic.make false) ?(config = Config.make ()) - (env : Eio_unix.Stdenv.base) : (module OT.Collector.BACKEND) = - let module E = (val mk_emitter ~sw ~stop ~config ~net:env#net ()) in - (module Backend (E)) + env : (module OT.Collector.BACKEND) = + let module E = (val mk_emitter ~stop ~net:env#net config) in + let module B = Backend (E) in + (* Run a background fiber to keep the backend ticking regularly. + + NOTE: This cannot be located inside the [Backend], because switches + are not thread safe, and cannot be used across domains, but the + backend is accessed across domains. 
*) + Eio.Fiber.fork ~sw (fun () -> + while not @@ Atomic.get stop do + Eio.Time.sleep env#clock 0.5; + B.tick () + done); + + (module B) let setup_ ~sw ?stop ?config env : unit = - let backend = create_backend ~sw ?stop ?config env in - OT.Collector.set_backend backend; - () + let backend = create_backend ?stop ?config ~sw env in + OT.Collector.set_backend backend -let setup ?stop ?config ?(enable = true) env = - if enable then Switch.run @@ fun sw -> setup_ ~sw ?stop ?config env +let setup ?stop ?config ?(enable = true) ~sw env = + if enable then setup_ ~sw ?stop ?config env let remove_backend () = OT.Collector.remove_backend ~on_done:ignore () -let with_setup ?stop ?(config = Config.make ()) ?(enable = true) f env = - (* NOTE: We must thread the switch [sw] through to all the forked threads in - the Backend's Emitter, to ensure that we can wait on all of them to - complete before before removing the backend during cleanup. *) - Switch.run (fun sw -> - if enable then ( - setup_ ~sw ?stop ~config env; - Switch.on_release sw remove_backend - ); - f env) +let with_setup ?stop ?config ?(enable = true) f env = + if enable then + Switch.run @@ fun sw -> + snd + @@ Fiber.pair + (fun () -> setup_ ~sw ?stop ?config env) + (fun () -> Fun.protect ~finally:(fun () -> remove_backend ()) f) + else + f () diff --git a/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.mli b/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.mli index 9010e5be..40b26a8e 100644 --- a/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.mli +++ b/src/client-cohttp-eio/opentelemetry_client_cohttp_eio.mli @@ -24,6 +24,7 @@ val setup : ?stop:bool Atomic.t -> ?config:Config.t -> ?enable:bool -> + sw:Eio.Switch.t -> Eio_unix.Stdenv.base -> unit (** Setup endpoint. This modifies {!Opentelemetry.Collector.backend}. 
@@ -43,7 +44,7 @@ val with_setup : ?stop:bool Atomic.t -> ?config:Config.t -> ?enable:bool -> - (Eio_unix.Stdenv.base -> 'a) -> + (unit -> 'a) -> Eio_unix.Stdenv.base -> 'a (** [with_setup () f] is like [setup(); f()] but takes care of cleaning up after From 0890a1a5cd7f59d3ba6e03bb7553d663fbde3ece Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Tue, 29 Jul 2025 23:41:16 -0400 Subject: [PATCH 3/4] Use multiple system threads in integration tests --- tests/bin/emit1_cohttp.ml | 7 +++ tests/bin/emit1_eio.ml | 32 ++++++++++---- tests/client_e2e/clients_e2e_lib.ml | 43 +++++++++++++------ tests/client_e2e/test_cottp_eio_client_e2e.ml | 12 ++++++ tests/client_e2e/test_cottp_lwt_client_e2e.ml | 2 + 5 files changed, 75 insertions(+), 21 deletions(-) diff --git a/tests/bin/emit1_cohttp.ml b/tests/bin/emit1_cohttp.ml index 0611c754..14f657cd 100644 --- a/tests/bin/emit1_cohttp.ml +++ b/tests/bin/emit1_cohttp.ml @@ -109,6 +109,7 @@ let () = let batch_metrics = ref 3 in let batch_logs = ref 400 in let url = ref None in + let n_procs = ref 1 in let opts = [ "--debug", Arg.Bool (( := ) debug), " enable debug output"; @@ -127,12 +128,18 @@ let () = "--sleep-outer", Arg.Set_float sleep_outer, " sleep (in s) in outer loop"; "--iterations", Arg.Set_int iterations, " the number of iterations to run"; "-j", Arg.Set_int n_jobs, " number of parallel jobs"; + "--procs", Arg.Set_int n_procs, " number of processes"; ] |> Arg.align in Arg.parse opts (fun _ -> ()) "emit1 [opt]*"; + if !n_procs > 1 then + failwith + "TODO: add support for running multiple processes to the lwt-cohttp \ + emitter"; + let some_if_nzero r = if !r > 0 then Some !r diff --git a/tests/bin/emit1_eio.ml b/tests/bin/emit1_eio.ml index dc5198ae..debb9e75 100644 --- a/tests/bin/emit1_eio.ml +++ b/tests/bin/emit1_eio.ml @@ -11,7 +11,7 @@ let sleep_outer = ref 2.0 let n_jobs = ref 1 -let iterations = ref 1 +let iterations = Atomic.make 1 let num_sleep = Atomic.make 0 @@ -32,8 +32,8 @@ let run_job clock _job_id : unit = 
~attrs:[ "i", `Int !i ] in - for j = 0 to !iterations do - if j >= !iterations then + for j = 0 to Atomic.get iterations do + if j >= Atomic.get iterations then (* Terminate program, having reached our max iterations *) Atomic.set stop true else @@ -80,7 +80,7 @@ let run_job clock _job_id : unit = done done -let run env : unit = +let run env proc () : unit = OT.GC_metrics.basic_setup (); OT.Metrics_callbacks.register (fun () -> @@ -91,7 +91,7 @@ let run env : unit = ]); let n_jobs = max 1 !n_jobs in - Printf.printf "run %d jobs\n%!" n_jobs; + Printf.printf "run %d jobs in proc %d\n%!" n_jobs proc; Eio.Switch.run (fun sw -> for j = 1 to n_jobs do @@ -109,6 +109,7 @@ let () = let batch_metrics = ref 3 in let batch_logs = ref 400 in let url = ref None in + let n_procs = ref 1 in let opts = [ "--debug", Arg.Bool (( := ) debug), " enable debug output"; @@ -125,8 +126,11 @@ let () = "--batch-logs", Arg.Int (( := ) batch_logs), " size of logs batch"; "--sleep-inner", Arg.Set_float sleep_inner, " sleep (in s) in inner loop"; "--sleep-outer", Arg.Set_float sleep_outer, " sleep (in s) in outer loop"; - "--iterations", Arg.Set_int iterations, " the number of iterations to run"; - "-j", Arg.Set_int n_jobs, " number of parallel jobs"; + ( "--iterations", + Arg.Int (Atomic.set iterations), + " the number of iterations to run" ); + "-j", Arg.Set_int n_jobs, " number of jobs per processes"; + "--procs", Arg.Set_int n_procs, " number of processes"; ] |> Arg.align in @@ -155,4 +159,16 @@ let () = Printf.printf "\ndone. %d spans in %.4fs (%.4f/s)\n%!" 
(Atomic.get num_tr) elapsed n_per_sec) in - Opentelemetry_client_cohttp_eio.with_setup ~stop ~config run |> Eio_main.run + Eio_main.run @@ fun env -> + (if !n_procs < 2 then + Opentelemetry_client_cohttp_eio.with_setup ~stop ~config (run env 0) env + else + Eio.Switch.run @@ fun sw -> + Opentelemetry_client_cohttp_eio.setup ~stop ~config ~sw env; + let dm = Eio.Stdenv.domain_mgr env in + Eio.Switch.run (fun sw -> + for proc = 1 to !n_procs do + Eio.Fiber.fork ~sw @@ fun () -> + Eio.Domain_manager.run dm (run env proc) + done)); + Opentelemetry.Collector.remove_backend () ~on_done:ignore diff --git a/tests/client_e2e/clients_e2e_lib.ml b/tests/client_e2e/clients_e2e_lib.ml index 8c085a22..3206bfe9 100644 --- a/tests/client_e2e/clients_e2e_lib.ml +++ b/tests/client_e2e/clients_e2e_lib.ml @@ -56,6 +56,15 @@ let filter_map_metrics f signals = |> List.find_map (fun ss -> ss.Proto.Metrics.metrics |> List.find_map f)) +let count_metrics_with_name name signals = + signals + |> filter_map_metrics (fun s -> + if String.equal s.Proto.Metrics.name name then + Some s + else + None) + |> List.length + let number_data_point_to_float : Proto.Metrics.number_data_point_value -> float = function | Proto.Metrics.As_double f -> f @@ -98,6 +107,7 @@ let count_logs_with_body p signals = type params = { url: string; jobs: int; + procs: int; batch_traces: int; batch_metrics: int; batch_logs: int; @@ -109,6 +119,8 @@ let cmd exec params = exec; "-j"; string_of_int params.jobs; + "--procs"; + string_of_int params.procs; "--url"; params.url; "--iterations"; @@ -134,22 +146,24 @@ let tests params signal_batches = (* TODO: What properties of batch sizes does it make sense to test? 
*) test "loop.outer spans" (fun () -> Alcotest.(check' int) - ~msg:"number of occurrences should equal the configured jobs" - ~expected:params.jobs + ~msg: + "number of occurrences should equal the configured jobs * the \ + configured processes" + ~expected:(params.jobs * params.procs) ~actual:(count_spans_with_name "loop.outer" signals)); test "loop.inner spans" (fun () -> Alcotest.(check' int) ~msg: "number of occurrences should equal the configured jobs * the \ - configured iterations" - ~expected:(params.jobs * params.iterations) + configured iterations * configured processes" + ~expected:(params.jobs * params.iterations * params.procs) ~actual:(count_spans_with_name "loop.inner" signals)); test "alloc spans" (fun () -> Alcotest.(check' int) ~msg: "number of occurrences should equal the configured jobs * the \ - configured iterations" - ~expected:(params.jobs * params.iterations) + configured iterations * configured processes" + ~expected:(params.jobs * params.iterations * params.procs) ~actual:(count_spans_with_name "alloc" signals); Alcotest.(check' bool) ~msg:"should have 'done with alloc' event" ~expected:true @@ -167,16 +181,19 @@ let tests params signal_batches = |> List.for_all (fun (e : Proto.Trace.span_event) -> String.equal e.name "done with alloc"))); test "num-sleep metrics" (fun () -> - Alcotest.(check' (float 0.)) - ~msg:"should record jobs * iterations sleeps" - ~expected:(params.jobs * params.iterations |> float_of_int) + Alcotest.(check' bool) + ~msg: + "should record at lest as many sleep metrics as there are \ + iterations configured" + ~expected:true ~actual: - (get_metric_values "num-sleep" signals - |> List.sort Float.compare |> List.rev |> List.hd)); + (count_metrics_with_name "num-sleep" signals >= params.iterations)); test "logs" (fun () -> Alcotest.(check' int) - ~msg:"should record jobs * iterations occurrences of 'inner at n'" - ~expected:(params.jobs * params.iterations) + ~msg: + "should record jobs * iterations occurrences * 
configured \ + processes of 'inner at n'" + ~expected:(params.jobs * params.iterations * params.procs) ~actual: (signals |> count_logs_with_body (function diff --git a/tests/client_e2e/test_cottp_eio_client_e2e.ml b/tests/client_e2e/test_cottp_eio_client_e2e.ml index 539a1ba4..ab5cf985 100644 --- a/tests/client_e2e/test_cottp_eio_client_e2e.ml +++ b/tests/client_e2e/test_cottp_eio_client_e2e.ml @@ -15,6 +15,7 @@ let () = { url; jobs = 1; + procs = 1; iterations = 1; batch_traces = 2; batch_metrics = 2; @@ -24,6 +25,17 @@ let () = { url; jobs = 3; + procs = 1; + iterations = 1; + batch_traces = 400; + batch_metrics = 3; + batch_logs = 400; + } ); + ( "emit1_eio", + { + url; + jobs = 3; + procs = 3; iterations = 1; batch_traces = 400; batch_metrics = 3; diff --git a/tests/client_e2e/test_cottp_lwt_client_e2e.ml b/tests/client_e2e/test_cottp_lwt_client_e2e.ml index 5c72165e..b1ba3772 100644 --- a/tests/client_e2e/test_cottp_lwt_client_e2e.ml +++ b/tests/client_e2e/test_cottp_lwt_client_e2e.ml @@ -25,6 +25,7 @@ let () = { url; jobs = 1; + procs = 1; iterations = 1; batch_traces = 2; batch_metrics = 2; @@ -34,6 +35,7 @@ let () = { url; jobs = 3; + procs = 1; iterations = 1; batch_traces = 400; batch_metrics = 3; From 16de06aac5f6cbeba2c47d3383d88061808e8b2d Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Fri, 1 Aug 2025 14:10:02 -0400 Subject: [PATCH 4/4] Make emit1_eio.ml emit deterministic signals As soon as we start running this in multible system threads, the race to trigger the globals `stop` and `iterations` makes the signal emissions non-deterministic, which makes the test kind of meaningless. This change should make them determinstic. 
--- tests/bin/emit1_eio.ml | 106 ++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 54 deletions(-) diff --git a/tests/bin/emit1_eio.ml b/tests/bin/emit1_eio.ml index debb9e75..83a9481e 100644 --- a/tests/bin/emit1_eio.ml +++ b/tests/bin/emit1_eio.ml @@ -11,8 +11,6 @@ let sleep_outer = ref 2.0 let n_jobs = ref 1 -let iterations = Atomic.make 1 - let num_sleep = Atomic.make 0 let stress_alloc_ = ref true @@ -24,63 +22,61 @@ let num_tr = Atomic.make 0 (* Counter used to mark simulated failures *) let i = ref 0 -let run_job clock _job_id : unit = - while not @@ Atomic.get stop do - let@ scope = - Atomic.incr num_tr; - OT.Trace.with_ ~kind:OT.Span.Span_kind_producer "loop.outer" - ~attrs:[ "i", `Int !i ] - in +let run_job clock _job_id iterations : unit = + let@ scope = + Atomic.incr num_tr; + OT.Trace.with_ ~kind:OT.Span.Span_kind_producer "loop.outer" + ~attrs:[ "i", `Int !i ] + in - for j = 0 to Atomic.get iterations do - if j >= Atomic.get iterations then - (* Terminate program, having reached our max iterations *) - Atomic.set stop true - else - (* parent scope is found via thread local storage *) + for j = 0 to iterations do + if j >= iterations then + (* Terminate program, having reached our max iterations *) + Atomic.set stop true + else + (* parent scope is found via thread local storage *) + let@ scope = + Atomic.incr num_tr; + OT.Trace.with_ ~scope ~kind:OT.Span.Span_kind_internal + ~attrs:[ "j", `Int j ] + "loop.inner" + in + + let () = Eio.Time.sleep clock !sleep_outer in + Atomic.incr num_sleep; + + OT.Logs.( + emit + [ + make_strf ~trace_id:scope.trace_id ~span_id:scope.span_id + ~severity:Severity_number_info "inner at %d" j; + ]); + + incr i; + + try + Atomic.incr num_tr; let@ scope = - Atomic.incr num_tr; - OT.Trace.with_ ~scope ~kind:OT.Span.Span_kind_internal - ~attrs:[ "j", `Int j ] - "loop.inner" + OT.Trace.with_ ~kind:OT.Span.Span_kind_internal ~scope "alloc" in + (* allocate some stuff *) + if !stress_alloc_ then ( + let 
_arr = Sys.opaque_identity @@ Array.make (25 * 25551) 42.0 in + ignore _arr + ); - let () = Eio.Time.sleep clock !sleep_outer in + let () = Eio.Time.sleep clock !sleep_inner in Atomic.incr num_sleep; - OT.Logs.( - emit - [ - make_strf ~trace_id:scope.trace_id ~span_id:scope.span_id - ~severity:Severity_number_info "inner at %d" j; - ]); + if j = 4 && !i mod 13 = 0 then failwith "oh no"; - incr i; - - try - Atomic.incr num_tr; - let@ scope = - OT.Trace.with_ ~kind:OT.Span.Span_kind_internal ~scope "alloc" - in - (* allocate some stuff *) - if !stress_alloc_ then ( - let _arr = Sys.opaque_identity @@ Array.make (25 * 25551) 42.0 in - ignore _arr - ); - - let () = Eio.Time.sleep clock !sleep_inner in - Atomic.incr num_sleep; - - if j = 4 && !i mod 13 = 0 then failwith "oh no"; - - (* simulate a failure *) - Opentelemetry.Scope.add_event scope (fun () -> - OT.Event.make "done with alloc") - with Failure _ -> () - done + (* simulate a failure *) + Opentelemetry.Scope.add_event scope (fun () -> + OT.Event.make "done with alloc") + with Failure _ -> () done -let run env proc () : unit = +let run env proc iterations () : unit = OT.GC_metrics.basic_setup (); OT.Metrics_callbacks.register (fun () -> @@ -95,7 +91,7 @@ let run env proc () : unit = Eio.Switch.run (fun sw -> for j = 1 to n_jobs do - Eio.Fiber.fork ~sw (fun () -> run_job env#clock j) + Eio.Fiber.fork ~sw (fun () -> run_job env#clock j iterations) done) let () = @@ -109,6 +105,7 @@ let () = let batch_metrics = ref 3 in let batch_logs = ref 400 in let url = ref None in + let n_iterations = ref 1 in let n_procs = ref 1 in let opts = [ @@ -127,7 +124,7 @@ let () = "--sleep-inner", Arg.Set_float sleep_inner, " sleep (in s) in inner loop"; "--sleep-outer", Arg.Set_float sleep_outer, " sleep (in s) in outer loop"; ( "--iterations", - Arg.Int (Atomic.set iterations), + Arg.Set_int n_iterations, " the number of iterations to run" ); "-j", Arg.Set_int n_jobs, " number of jobs per processes"; "--procs", Arg.Set_int n_procs, 
" number of processes"; @@ -161,7 +158,8 @@ let () = in Eio_main.run @@ fun env -> (if !n_procs < 2 then - Opentelemetry_client_cohttp_eio.with_setup ~stop ~config (run env 0) env + Opentelemetry_client_cohttp_eio.with_setup ~stop ~config + (run env 0 !n_iterations) env else Eio.Switch.run @@ fun sw -> Opentelemetry_client_cohttp_eio.setup ~stop ~config ~sw env; @@ -169,6 +167,6 @@ let () = Eio.Switch.run (fun sw -> for proc = 1 to !n_procs do Eio.Fiber.fork ~sw @@ fun () -> - Eio.Domain_manager.run dm (run env proc) + Eio.Domain_manager.run dm (run env proc !n_iterations) done)); Opentelemetry.Collector.remove_backend () ~on_done:ignore