From f1f379d2e1b76b49bccd168fc03e4f922dce47f9 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Tue, 3 Mar 2026 17:15:53 -0500 Subject: [PATCH] client: use self-debug and new provider config --- src/client/export_error.ml | 30 ++++---- src/client/exporter_config.ml | 123 ++++++++++++++++++++------------- src/client/exporter_config.mli | 50 ++++++-------- src/client/generic_consumer.ml | 14 ++-- 4 files changed, 121 insertions(+), 96 deletions(-) diff --git a/src/client/export_error.ml b/src/client/export_error.ml index 4ed81c92..a82954da 100644 --- a/src/client/export_error.ml +++ b/src/client/export_error.ml @@ -11,9 +11,12 @@ let str_to_hex (s : string) : string = (** Report the error on stderr. *) let report_err : t -> unit = function - | `Sysbreak -> Printf.eprintf "opentelemetry: ctrl-c captured, stopping\n%!" + | `Sysbreak -> + Opentelemetry.Self_debug.log Opentelemetry.Self_debug.Info (fun () -> + "opentelemetry: ctrl-c captured, stopping") | `Failure msg -> - Format.eprintf "@[<2>opentelemetry: export failed: %s@]@." msg + Opentelemetry.Self_debug.log Opentelemetry.Self_debug.Error (fun () -> + Printf.sprintf "opentelemetry: export failed: %s" msg) | `Status ( code, { @@ -22,17 +25,18 @@ let report_err : t -> unit = function details; _presence = _; } ) -> - let pp_details out l = - List.iter - (fun s -> Format.fprintf out "%S;@ " (Bytes.unsafe_to_string s)) - l - in - Format.eprintf - "@[<2>opentelemetry: export failed with@ http code=%d@ status \ - {@[code=%ld;@ message=%S;@ details=[@[%a@]]@]}@]@." - code scode - (Bytes.unsafe_to_string message) - pp_details details + Opentelemetry.Self_debug.log Opentelemetry.Self_debug.Error (fun () -> + let pp_details out l = + List.iter + (fun s -> Format.fprintf out "%S;@ " (Bytes.unsafe_to_string s)) + l + in + Format.asprintf + "@[<2>opentelemetry: export failed with@ http code=%d@ status \ + {@[code=%ld;@ message=%S;@ details=[@[%a@]]@]}@]" + code scode + (Bytes.unsafe_to_string message) + pp_details details) let decode_invalid_http_response ~code ~url (body : string) : t = try diff --git a/src/client/exporter_config.ml b/src/client/exporter_config.ml index 93c3a89f..39f7ae3c 100644 --- a/src/client/exporter_config.ml +++ b/src/client/exporter_config.ml @@ -2,12 +2,7 @@ type protocol = | Http_protobuf | Http_json -type log_level = - | Log_level_none - | Log_level_error - | Log_level_warn - | Log_level_info - | Log_level_debug +type log_level = Opentelemetry.Self_debug.level option type rest = unit @@ -27,10 +22,9 @@ type t = { timeout_traces_ms: int; timeout_metrics_ms: int; timeout_logs_ms: int; - batch_traces: int option; - batch_metrics: int option; - batch_logs: int option; - batch_timeout_ms: int; + traces: Opentelemetry.Provider_config.t; + metrics: Opentelemetry.Provider_config.t; + logs: Opentelemetry.Provider_config.t; self_trace: bool; http_concurrency_level: int option; retry_max_attempts: int; @@ -56,11 +50,13 @@ open struct | Http_json -> Format.fprintf out "http/json" let pp_log_level out = function - | Log_level_none -> Format.fprintf out "none" - | Log_level_error -> Format.fprintf out "error" - | Log_level_warn -> Format.fprintf out "warn" - | Log_level_info -> Format.fprintf out "info" - | Log_level_debug -> Format.fprintf out "debug" + | None -> Format.fprintf out "none" + | Some level -> + Format.fprintf out "%s" (Opentelemetry.Self_debug.string_of_level level) + + let pp_provider_config out (c : Opentelemetry.Provider_config.t) = + Format.fprintf out "{batch=%a;@ timeout=%a}" ppiopt c.batch Mtime.Span.pp + c.timeout end let pp out (self : t) : unit = @@ -81,10 +77,9 @@ let pp out (self : t) : unit = timeout_traces_ms; timeout_metrics_ms; timeout_logs_ms; - batch_traces; - batch_metrics; - batch_logs; - batch_timeout_ms; + traces; + metrics; + logs; http_concurrency_level; retry_max_attempts; retry_initial_delay_ms; @@ -101,16 +96,15 @@ let pp out (self : t) : unit = %a@];@ @[<2>headers_metrics=@,\ %a@];@ @[<2>headers_logs=@,\ %a@];@ protocol=%a;@ timeout_ms=%d;@ timeout_traces_ms=%d;@ \ - timeout_metrics_ms=%d;@ timeout_logs_ms=%d;@ batch_traces=%a;@ \ - batch_metrics=%a;@ batch_logs=%a;@ batch_timeout_ms=%d;@ \ - http_concurrency_level=%a;@ retry_max_attempts=%d;@ \ + timeout_metrics_ms=%d;@ timeout_logs_ms=%d;@ traces=%a;@ metrics=%a;@ \ + logs=%a;@ http_concurrency_level=%a;@ retry_max_attempts=%d;@ \ retry_initial_delay_ms=%.0f;@ retry_max_delay_ms=%.0f;@ \ retry_backoff_multiplier=%.1f @]}" debug pp_log_level log_level sdk_disabled self_trace url_traces url_metrics url_logs ppheaders headers ppheaders headers_traces ppheaders headers_metrics ppheaders headers_logs pp_protocol protocol timeout_ms - timeout_traces_ms timeout_metrics_ms timeout_logs_ms ppiopt batch_traces - ppiopt batch_metrics ppiopt batch_logs batch_timeout_ms ppiopt + timeout_traces_ms timeout_metrics_ms timeout_logs_ms pp_provider_config + traces pp_provider_config metrics pp_provider_config logs ppiopt http_concurrency_level retry_max_attempts retry_initial_delay_ms retry_max_delay_ms retry_backoff_multiplier @@ -124,9 +118,13 @@ type 'k make = ?url_traces:string -> ?url_metrics:string -> ?url_logs:string -> - ?batch_traces:int option -> - ?batch_metrics:int option -> - ?batch_logs:int option -> + ?batch_traces:int -> + ?batch_metrics:int -> + ?batch_logs:int -> + ?batch_timeout_ms:int -> + ?traces:Opentelemetry.Provider_config.t -> + ?metrics:Opentelemetry.Provider_config.t -> + ?logs:Opentelemetry.Provider_config.t -> ?headers:(string * string) list -> ?headers_traces:(string * string) list -> ?headers_metrics:(string * string) list -> @@ -136,7 +134,6 @@ type 'k make = ?timeout_traces_ms:int -> ?timeout_metrics_ms:int -> ?timeout_logs_ms:int -> - ?batch_timeout_ms:int -> ?self_trace:bool -> ?http_concurrency_level:int -> ?retry_max_attempts:int -> @@ -155,22 +152,22 @@ open struct | Some ("1" | "true") -> true | _ -> false - let get_log_level_from_env () = + let get_log_level_from_env () : log_level = match Sys.getenv_opt "OTEL_LOG_LEVEL" with - | Some "none" -> Log_level_none - | Some "error" -> Log_level_error - | Some "warn" -> Log_level_warn - | Some "info" -> Log_level_info - | Some "debug" -> Log_level_debug + | Some "none" -> None + | Some "error" -> Some Error + | Some "warn" -> Some Warning + | Some "info" -> Some Info + | Some "debug" -> Some Debug | Some s -> - Printf.eprintf "warning: unknown log level %S, defaulting to info\n%!" s; - (* log in info level, so we at least don't miss warnings and errors *) - Log_level_info + Opentelemetry.Self_debug.log Warning (fun () -> + Printf.sprintf "unknown log level %S, defaulting to info" s); + Some Info | None -> if get_debug_from_env () then - Log_level_debug + Some Debug else - Log_level_none + Some Info let get_sdk_disabled_from_env () = match Sys.getenv_opt "OTEL_SDK_DISABLED" with @@ -241,16 +238,47 @@ module Env () : ENV = struct let make k ?(debug = get_debug_from_env ()) ?(log_level = get_log_level_from_env ()) ?(sdk_disabled = get_sdk_disabled_from_env ()) ?url ?url_traces - ?url_metrics ?url_logs ?(batch_traces = Some 400) - ?(batch_metrics = Some 200) ?(batch_logs = Some 400) + ?url_metrics ?url_logs ?batch_traces ?batch_metrics ?batch_logs + ?(batch_timeout_ms = 2_000) ?traces ?metrics ?logs ?(headers = get_general_headers_from_env ()) ?headers_traces ?headers_metrics ?headers_logs ?(protocol = get_protocol_from_env "OTEL_EXPORTER_OTLP_PROTOCOL") ?(timeout_ms = get_timeout_from_env "OTEL_EXPORTER_OTLP_TIMEOUT" 10_000) ?timeout_traces_ms ?timeout_metrics_ms ?timeout_logs_ms - ?(batch_timeout_ms = 2_000) ?(self_trace = false) ?http_concurrency_level - ?(retry_max_attempts = 3) ?(retry_initial_delay_ms = 100.) - ?(retry_max_delay_ms = 5000.) ?(retry_backoff_multiplier = 2.0) = + ?(self_trace = false) ?http_concurrency_level ?(retry_max_attempts = 3) + ?(retry_initial_delay_ms = 100.) ?(retry_max_delay_ms = 5000.) + ?(retry_backoff_multiplier = 2.0) = + let batch_timeout_ = Mtime.Span.(batch_timeout_ms * ms) in + let traces = + match traces with + | Some t -> t + | None -> + let batch = + match batch_traces with + | Some b -> b + | None -> get_timeout_from_env "OTEL_BSP_MAX_EXPORT_BATCH_SIZE" 400 + in + Opentelemetry.Provider_config.make ~batch ~timeout:batch_timeout_ () + in + let metrics = + match metrics with + | Some m -> m + | None -> + let batch = + match batch_metrics with + | Some b -> b + | None -> get_timeout_from_env "OTEL_METRIC_EXPORT_INTERVAL" 200 + in + Opentelemetry.Provider_config.make ~batch ~timeout:batch_timeout_ () + in + let logs = + match logs with + | Some l -> l + | None -> + let batch = Option.value batch_logs ~default:400 in + Opentelemetry.Provider_config.make ~batch ~timeout:batch_timeout_ () + in + let url_traces, url_metrics, url_logs = let base_url = let base_url = @@ -343,10 +371,9 @@ module Env () : ENV = struct timeout_traces_ms; timeout_metrics_ms; timeout_logs_ms; - batch_traces; - batch_metrics; - batch_logs; - batch_timeout_ms; + traces; + metrics; + logs; self_trace; http_concurrency_level; retry_max_attempts; diff --git a/src/client/exporter_config.mli b/src/client/exporter_config.mli index ef09ce05..68f4ed27 100644 --- a/src/client/exporter_config.mli +++ b/src/client/exporter_config.mli @@ -8,12 +8,9 @@ type protocol = | Http_protobuf | Http_json -type log_level = - | Log_level_none - | Log_level_error - | Log_level_warn - | Log_level_info - | Log_level_debug +type log_level = Opentelemetry.Self_debug.level option +(** [None] disables internal diagnostic logging; [Some level] enables it at that + level and above. Maps to [OTEL_LOG_LEVEL] env var. *) type rest (** opaque type to force using {!make} while allowing record updates *) @@ -60,25 +57,17 @@ type t = { timeout_logs_ms: int; (** Timeout for log exports. Read from OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, falls back to timeout_ms. *) - batch_traces: int option; - (** Batch traces? If [Some i], then this produces batches of (at most) [i] - items. If [None], there is no batching. - - Note that traces and metrics are batched separately. Default - [Some 400]. *) - batch_metrics: int option; - (** Batch metrics? If [Some i], then this produces batches of (at most) - [i] items. If [None], there is no batching. - - Note that traces and metrics are batched separately. Default - [Some 200]. *) - batch_logs: int option; - (** Batch logs? See {!batch_metrics} for details. Default [Some 400] *) - batch_timeout_ms: int; - (** Number of milliseconds after which we will emit a batch, even - incomplete. Note that the batch might take longer than that, because - this is only checked when a new event occurs or when a tick is - emitted. Default 2_000. *) + traces: Opentelemetry.Provider_config.t; + (** Per-provider batching config for traces. Default: batch=400, + timeout=2s. The batch size is read from OTEL_BSP_MAX_EXPORT_BATCH_SIZE + if set. *) + metrics: Opentelemetry.Provider_config.t; + (** Per-provider batching config for metrics. Default: batch=200, + timeout=2s. The batch size is read from OTEL_METRIC_EXPORT_INTERVAL if + set. *) + logs: Opentelemetry.Provider_config.t; + (** Per-provider batching config for logs. Default: batch=400, timeout=2s. + *) self_trace: bool; (** If true, the OTEL library will perform some self-instrumentation. Default [false]. @@ -117,9 +106,13 @@ type 'k make = ?url_traces:string -> ?url_metrics:string -> ?url_logs:string -> - ?batch_traces:int option -> - ?batch_metrics:int option -> - ?batch_logs:int option -> + ?batch_traces:int -> + ?batch_metrics:int -> + ?batch_logs:int -> + ?batch_timeout_ms:int -> + ?traces:Opentelemetry.Provider_config.t -> + ?metrics:Opentelemetry.Provider_config.t -> + ?logs:Opentelemetry.Provider_config.t -> ?headers:(string * string) list -> ?headers_traces:(string * string) list -> ?headers_metrics:(string * string) list -> @@ -129,7 +122,6 @@ type 'k make = ?timeout_traces_ms:int -> ?timeout_metrics_ms:int -> ?timeout_logs_ms:int -> - ?batch_timeout_ms:int -> ?self_trace:bool -> ?http_concurrency_level:int -> ?retry_max_attempts:int -> diff --git a/src/client/generic_consumer.ml b/src/client/generic_consumer.ml index f83e1c42..81f12247 100644 --- a/src/client/generic_consumer.ml +++ b/src/client/generic_consumer.ml @@ -101,11 +101,11 @@ end = struct (* sanity check about the queue, which should be drained *) let size_q = Bounded_queue.Recv.size self.q in if size_q > 0 then - Printf.eprintf - "otel: warning: workers exited but work queue still contains %d \ - elements\n\ - %!" - size_q + OTEL.Self_debug.log OTEL.Self_debug.Warning (fun () -> + Printf.sprintf + "otel: warning: workers exited but work queue still contains %d \ + elements" + size_q) ) let send_signals (self : state) (sender : Sender.t) ~backoff @@ -122,7 +122,8 @@ end = struct Util_net_backoff.on_success backoff; IO.return () | Error `Sysbreak -> - Printf.eprintf "ctrl-c captured, stopping\n%!"; + OTEL.Self_debug.log OTEL.Self_debug.Info (fun () -> + "ctrl-c captured, stopping"); shutdown self; IO.return () | Error err -> @@ -135,6 +136,7 @@ end = struct let start_worker (self : state) : unit = let sender = Sender.create ~config:self.sender_config () in let backoff = Util_net_backoff.create () in + OTEL.Self_debug.log OTEL.Self_debug.Debug (fun () -> "otel worker started"); (* loop on [q] *) let rec loop () : unit IO.t =