client: use self-debug and new provider config

This commit is contained in:
Simon Cruanes 2026-03-03 17:15:53 -05:00
parent df643c9af6
commit f1f379d2e1
No known key found for this signature in database
GPG key ID: EBFFF6F283F3A2B4
4 changed files with 121 additions and 96 deletions

View file

@ -11,9 +11,12 @@ let str_to_hex (s : string) : string =
(** Report the error on stderr. *) (** Report the error via the self-debug logger. *)
let report_err : t -> unit = function let report_err : t -> unit = function
| `Sysbreak -> Printf.eprintf "opentelemetry: ctrl-c captured, stopping\n%!" | `Sysbreak ->
Opentelemetry.Self_debug.log Opentelemetry.Self_debug.Info (fun () ->
"opentelemetry: ctrl-c captured, stopping")
| `Failure msg -> | `Failure msg ->
Format.eprintf "@[<2>opentelemetry: export failed: %s@]@." msg Opentelemetry.Self_debug.log Opentelemetry.Self_debug.Error (fun () ->
Printf.sprintf "opentelemetry: export failed: %s" msg)
| `Status | `Status
( code, ( code,
{ {
@ -22,17 +25,18 @@ let report_err : t -> unit = function
details; details;
_presence = _; _presence = _;
} ) -> } ) ->
let pp_details out l = Opentelemetry.Self_debug.log Opentelemetry.Self_debug.Error (fun () ->
List.iter let pp_details out l =
(fun s -> Format.fprintf out "%S;@ " (Bytes.unsafe_to_string s)) List.iter
l (fun s -> Format.fprintf out "%S;@ " (Bytes.unsafe_to_string s))
in l
Format.eprintf in
"@[<2>opentelemetry: export failed with@ http code=%d@ status \ Format.asprintf
{@[code=%ld;@ message=%S;@ details=[@[%a@]]@]}@]@." "@[<2>opentelemetry: export failed with@ http code=%d@ status \
code scode {@[code=%ld;@ message=%S;@ details=[@[%a@]]@]}@]"
(Bytes.unsafe_to_string message) code scode
pp_details details (Bytes.unsafe_to_string message)
pp_details details)
let decode_invalid_http_response ~code ~url (body : string) : t = let decode_invalid_http_response ~code ~url (body : string) : t =
try try

View file

@ -2,12 +2,7 @@ type protocol =
| Http_protobuf | Http_protobuf
| Http_json | Http_json
type log_level = type log_level = Opentelemetry.Self_debug.level option
| Log_level_none
| Log_level_error
| Log_level_warn
| Log_level_info
| Log_level_debug
type rest = unit type rest = unit
@ -27,10 +22,9 @@ type t = {
timeout_traces_ms: int; timeout_traces_ms: int;
timeout_metrics_ms: int; timeout_metrics_ms: int;
timeout_logs_ms: int; timeout_logs_ms: int;
batch_traces: int option; traces: Opentelemetry.Provider_config.t;
batch_metrics: int option; metrics: Opentelemetry.Provider_config.t;
batch_logs: int option; logs: Opentelemetry.Provider_config.t;
batch_timeout_ms: int;
self_trace: bool; self_trace: bool;
http_concurrency_level: int option; http_concurrency_level: int option;
retry_max_attempts: int; retry_max_attempts: int;
@ -56,11 +50,13 @@ open struct
| Http_json -> Format.fprintf out "http/json" | Http_json -> Format.fprintf out "http/json"
let pp_log_level out = function let pp_log_level out = function
| Log_level_none -> Format.fprintf out "none" | None -> Format.fprintf out "none"
| Log_level_error -> Format.fprintf out "error" | Some level ->
| Log_level_warn -> Format.fprintf out "warn" Format.fprintf out "%s" (Opentelemetry.Self_debug.string_of_level level)
| Log_level_info -> Format.fprintf out "info"
| Log_level_debug -> Format.fprintf out "debug" let pp_provider_config out (c : Opentelemetry.Provider_config.t) =
Format.fprintf out "{batch=%a;@ timeout=%a}" ppiopt c.batch Mtime.Span.pp
c.timeout
end end
let pp out (self : t) : unit = let pp out (self : t) : unit =
@ -81,10 +77,9 @@ let pp out (self : t) : unit =
timeout_traces_ms; timeout_traces_ms;
timeout_metrics_ms; timeout_metrics_ms;
timeout_logs_ms; timeout_logs_ms;
batch_traces; traces;
batch_metrics; metrics;
batch_logs; logs;
batch_timeout_ms;
http_concurrency_level; http_concurrency_level;
retry_max_attempts; retry_max_attempts;
retry_initial_delay_ms; retry_initial_delay_ms;
@ -101,16 +96,15 @@ let pp out (self : t) : unit =
%a@];@ @[<2>headers_metrics=@,\ %a@];@ @[<2>headers_metrics=@,\
%a@];@ @[<2>headers_logs=@,\ %a@];@ @[<2>headers_logs=@,\
%a@];@ protocol=%a;@ timeout_ms=%d;@ timeout_traces_ms=%d;@ \ %a@];@ protocol=%a;@ timeout_ms=%d;@ timeout_traces_ms=%d;@ \
timeout_metrics_ms=%d;@ timeout_logs_ms=%d;@ batch_traces=%a;@ \ timeout_metrics_ms=%d;@ timeout_logs_ms=%d;@ traces=%a;@ metrics=%a;@ \
batch_metrics=%a;@ batch_logs=%a;@ batch_timeout_ms=%d;@ \ logs=%a;@ http_concurrency_level=%a;@ retry_max_attempts=%d;@ \
http_concurrency_level=%a;@ retry_max_attempts=%d;@ \
retry_initial_delay_ms=%.0f;@ retry_max_delay_ms=%.0f;@ \ retry_initial_delay_ms=%.0f;@ retry_max_delay_ms=%.0f;@ \
retry_backoff_multiplier=%.1f @]}" retry_backoff_multiplier=%.1f @]}"
debug pp_log_level log_level sdk_disabled self_trace url_traces url_metrics debug pp_log_level log_level sdk_disabled self_trace url_traces url_metrics
url_logs ppheaders headers ppheaders headers_traces ppheaders url_logs ppheaders headers ppheaders headers_traces ppheaders
headers_metrics ppheaders headers_logs pp_protocol protocol timeout_ms headers_metrics ppheaders headers_logs pp_protocol protocol timeout_ms
timeout_traces_ms timeout_metrics_ms timeout_logs_ms ppiopt batch_traces timeout_traces_ms timeout_metrics_ms timeout_logs_ms pp_provider_config
ppiopt batch_metrics ppiopt batch_logs batch_timeout_ms ppiopt traces pp_provider_config metrics pp_provider_config logs ppiopt
http_concurrency_level retry_max_attempts retry_initial_delay_ms http_concurrency_level retry_max_attempts retry_initial_delay_ms
retry_max_delay_ms retry_backoff_multiplier retry_max_delay_ms retry_backoff_multiplier
@ -124,9 +118,13 @@ type 'k make =
?url_traces:string -> ?url_traces:string ->
?url_metrics:string -> ?url_metrics:string ->
?url_logs:string -> ?url_logs:string ->
?batch_traces:int option -> ?batch_traces:int ->
?batch_metrics:int option -> ?batch_metrics:int ->
?batch_logs:int option -> ?batch_logs:int ->
?batch_timeout_ms:int ->
?traces:Opentelemetry.Provider_config.t ->
?metrics:Opentelemetry.Provider_config.t ->
?logs:Opentelemetry.Provider_config.t ->
?headers:(string * string) list -> ?headers:(string * string) list ->
?headers_traces:(string * string) list -> ?headers_traces:(string * string) list ->
?headers_metrics:(string * string) list -> ?headers_metrics:(string * string) list ->
@ -136,7 +134,6 @@ type 'k make =
?timeout_traces_ms:int -> ?timeout_traces_ms:int ->
?timeout_metrics_ms:int -> ?timeout_metrics_ms:int ->
?timeout_logs_ms:int -> ?timeout_logs_ms:int ->
?batch_timeout_ms:int ->
?self_trace:bool -> ?self_trace:bool ->
?http_concurrency_level:int -> ?http_concurrency_level:int ->
?retry_max_attempts:int -> ?retry_max_attempts:int ->
@ -155,22 +152,22 @@ open struct
| Some ("1" | "true") -> true | Some ("1" | "true") -> true
| _ -> false | _ -> false
let get_log_level_from_env () = let get_log_level_from_env () : log_level =
match Sys.getenv_opt "OTEL_LOG_LEVEL" with match Sys.getenv_opt "OTEL_LOG_LEVEL" with
| Some "none" -> Log_level_none | Some "none" -> None
| Some "error" -> Log_level_error | Some "error" -> Some Error
| Some "warn" -> Log_level_warn | Some "warn" -> Some Warning
| Some "info" -> Log_level_info | Some "info" -> Some Info
| Some "debug" -> Log_level_debug | Some "debug" -> Some Debug
| Some s -> | Some s ->
Printf.eprintf "warning: unknown log level %S, defaulting to info\n%!" s; Opentelemetry.Self_debug.log Warning (fun () ->
(* log in info level, so we at least don't miss warnings and errors *) Printf.sprintf "unknown log level %S, defaulting to info" s);
Log_level_info Some Info
| None -> | None ->
if get_debug_from_env () then if get_debug_from_env () then
Log_level_debug Some Debug
else else
Log_level_none Some Info
let get_sdk_disabled_from_env () = let get_sdk_disabled_from_env () =
match Sys.getenv_opt "OTEL_SDK_DISABLED" with match Sys.getenv_opt "OTEL_SDK_DISABLED" with
@ -241,16 +238,47 @@ module Env () : ENV = struct
let make k ?(debug = get_debug_from_env ()) let make k ?(debug = get_debug_from_env ())
?(log_level = get_log_level_from_env ()) ?(log_level = get_log_level_from_env ())
?(sdk_disabled = get_sdk_disabled_from_env ()) ?url ?url_traces ?(sdk_disabled = get_sdk_disabled_from_env ()) ?url ?url_traces
?url_metrics ?url_logs ?(batch_traces = Some 400) ?url_metrics ?url_logs ?batch_traces ?batch_metrics ?batch_logs
?(batch_metrics = Some 200) ?(batch_logs = Some 400) ?(batch_timeout_ms = 2_000) ?traces ?metrics ?logs
?(headers = get_general_headers_from_env ()) ?headers_traces ?(headers = get_general_headers_from_env ()) ?headers_traces
?headers_metrics ?headers_logs ?headers_metrics ?headers_logs
?(protocol = get_protocol_from_env "OTEL_EXPORTER_OTLP_PROTOCOL") ?(protocol = get_protocol_from_env "OTEL_EXPORTER_OTLP_PROTOCOL")
?(timeout_ms = get_timeout_from_env "OTEL_EXPORTER_OTLP_TIMEOUT" 10_000) ?(timeout_ms = get_timeout_from_env "OTEL_EXPORTER_OTLP_TIMEOUT" 10_000)
?timeout_traces_ms ?timeout_metrics_ms ?timeout_logs_ms ?timeout_traces_ms ?timeout_metrics_ms ?timeout_logs_ms
?(batch_timeout_ms = 2_000) ?(self_trace = false) ?http_concurrency_level ?(self_trace = false) ?http_concurrency_level ?(retry_max_attempts = 3)
?(retry_max_attempts = 3) ?(retry_initial_delay_ms = 100.) ?(retry_initial_delay_ms = 100.) ?(retry_max_delay_ms = 5000.)
?(retry_max_delay_ms = 5000.) ?(retry_backoff_multiplier = 2.0) = ?(retry_backoff_multiplier = 2.0) =
let batch_timeout_ = Mtime.Span.(batch_timeout_ms * ms) in
let traces =
match traces with
| Some t -> t
| None ->
let batch =
match batch_traces with
| Some b -> b
| None -> get_timeout_from_env "OTEL_BSP_MAX_EXPORT_BATCH_SIZE" 400
in
Opentelemetry.Provider_config.make ~batch ~timeout:batch_timeout_ ()
in
let metrics =
match metrics with
| Some m -> m
| None ->
let batch =
match batch_metrics with
| Some b -> b
| None -> get_timeout_from_env "OTEL_METRIC_EXPORT_INTERVAL" 200
in
Opentelemetry.Provider_config.make ~batch ~timeout:batch_timeout_ ()
in
let logs =
match logs with
| Some l -> l
| None ->
let batch = Option.value batch_logs ~default:400 in
Opentelemetry.Provider_config.make ~batch ~timeout:batch_timeout_ ()
in
let url_traces, url_metrics, url_logs = let url_traces, url_metrics, url_logs =
let base_url = let base_url =
let base_url = let base_url =
@ -343,10 +371,9 @@ module Env () : ENV = struct
timeout_traces_ms; timeout_traces_ms;
timeout_metrics_ms; timeout_metrics_ms;
timeout_logs_ms; timeout_logs_ms;
batch_traces; traces;
batch_metrics; metrics;
batch_logs; logs;
batch_timeout_ms;
self_trace; self_trace;
http_concurrency_level; http_concurrency_level;
retry_max_attempts; retry_max_attempts;

View file

@ -8,12 +8,9 @@ type protocol =
| Http_protobuf | Http_protobuf
| Http_json | Http_json
type log_level = type log_level = Opentelemetry.Self_debug.level option
| Log_level_none (** [None] disables internal diagnostic logging; [Some level] enables it at that
| Log_level_error level and above. Maps to [OTEL_LOG_LEVEL] env var. *)
| Log_level_warn
| Log_level_info
| Log_level_debug
type rest type rest
(** opaque type to force using {!make} while allowing record updates *) (** opaque type to force using {!make} while allowing record updates *)
@ -60,25 +57,17 @@ type t = {
timeout_logs_ms: int; timeout_logs_ms: int;
(** Timeout for log exports. Read from OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, (** Timeout for log exports. Read from OTEL_EXPORTER_OTLP_LOGS_TIMEOUT,
falls back to timeout_ms. *) falls back to timeout_ms. *)
batch_traces: int option; traces: Opentelemetry.Provider_config.t;
(** Batch traces? If [Some i], then this produces batches of (at most) [i] (** Per-provider batching config for traces. Default: batch=400,
items. If [None], there is no batching. timeout=2s. The batch size is read from OTEL_BSP_MAX_EXPORT_BATCH_SIZE
if set. *)
Note that traces and metrics are batched separately. Default metrics: Opentelemetry.Provider_config.t;
[Some 400]. *) (** Per-provider batching config for metrics. Default: batch=200,
batch_metrics: int option; timeout=2s. The batch size is read from OTEL_METRIC_EXPORT_INTERVAL if
(** Batch metrics? If [Some i], then this produces batches of (at most) set. *)
[i] items. If [None], there is no batching. logs: Opentelemetry.Provider_config.t;
(** Per-provider batching config for logs. Default: batch=400, timeout=2s.
Note that traces and metrics are batched separately. Default *)
[Some 200]. *)
batch_logs: int option;
(** Batch logs? See {!batch_metrics} for details. Default [Some 400] *)
batch_timeout_ms: int;
(** Number of milliseconds after which we will emit a batch, even
incomplete. Note that the batch might take longer than that, because
this is only checked when a new event occurs or when a tick is
emitted. Default 2_000. *)
self_trace: bool; self_trace: bool;
(** If true, the OTEL library will perform some self-instrumentation. (** If true, the OTEL library will perform some self-instrumentation.
Default [false]. Default [false].
@ -117,9 +106,13 @@ type 'k make =
?url_traces:string -> ?url_traces:string ->
?url_metrics:string -> ?url_metrics:string ->
?url_logs:string -> ?url_logs:string ->
?batch_traces:int option -> ?batch_traces:int ->
?batch_metrics:int option -> ?batch_metrics:int ->
?batch_logs:int option -> ?batch_logs:int ->
?batch_timeout_ms:int ->
?traces:Opentelemetry.Provider_config.t ->
?metrics:Opentelemetry.Provider_config.t ->
?logs:Opentelemetry.Provider_config.t ->
?headers:(string * string) list -> ?headers:(string * string) list ->
?headers_traces:(string * string) list -> ?headers_traces:(string * string) list ->
?headers_metrics:(string * string) list -> ?headers_metrics:(string * string) list ->
@ -129,7 +122,6 @@ type 'k make =
?timeout_traces_ms:int -> ?timeout_traces_ms:int ->
?timeout_metrics_ms:int -> ?timeout_metrics_ms:int ->
?timeout_logs_ms:int -> ?timeout_logs_ms:int ->
?batch_timeout_ms:int ->
?self_trace:bool -> ?self_trace:bool ->
?http_concurrency_level:int -> ?http_concurrency_level:int ->
?retry_max_attempts:int -> ?retry_max_attempts:int ->

View file

@ -101,11 +101,11 @@ end = struct
(* sanity check about the queue, which should be drained *) (* sanity check about the queue, which should be drained *)
let size_q = Bounded_queue.Recv.size self.q in let size_q = Bounded_queue.Recv.size self.q in
if size_q > 0 then if size_q > 0 then
Printf.eprintf OTEL.Self_debug.log OTEL.Self_debug.Warning (fun () ->
"otel: warning: workers exited but work queue still contains %d \ Printf.sprintf
elements\n\ "otel: warning: workers exited but work queue still contains %d \
%!" elements"
size_q size_q)
) )
let send_signals (self : state) (sender : Sender.t) ~backoff let send_signals (self : state) (sender : Sender.t) ~backoff
@ -122,7 +122,8 @@ end = struct
Util_net_backoff.on_success backoff; Util_net_backoff.on_success backoff;
IO.return () IO.return ()
| Error `Sysbreak -> | Error `Sysbreak ->
Printf.eprintf "ctrl-c captured, stopping\n%!"; OTEL.Self_debug.log OTEL.Self_debug.Info (fun () ->
"ctrl-c captured, stopping");
shutdown self; shutdown self;
IO.return () IO.return ()
| Error err -> | Error err ->
@ -135,6 +136,7 @@ end = struct
let start_worker (self : state) : unit = let start_worker (self : state) : unit =
let sender = Sender.create ~config:self.sender_config () in let sender = Sender.create ~config:self.sender_config () in
let backoff = Util_net_backoff.create () in let backoff = Util_net_backoff.create () in
OTEL.Self_debug.log OTEL.Self_debug.Debug (fun () -> "otel worker started");
(* loop on [q] *) (* loop on [q] *)
let rec loop () : unit IO.t = let rec loop () : unit IO.t =