From e9485c97da017c7fc811d5bc31d3b8ec204be2fb Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sat, 14 Feb 2026 13:27:01 +0000 Subject: [PATCH] Add retry with exponential backoff to HTTP client - Add retry_max_attempts, retry_initial_delay_ms, retry_max_delay_ms, retry_backoff_multiplier to config - Retry on network failures, 5xx errors, 429 (rate limit), 408 (timeout) - No retry on 4xx client errors or user interrupt (Sysbreak) - Default: 3 attempts, 100ms initial delay, 5s max delay, 2x multiplier - Uses existing IO.sleep_s from generic_io --- src/client/exporter_config.ml | 26 ++++++++++++++++++++--- src/client/exporter_config.mli | 13 ++++++++++++ src/client/generic_http_consumer.ml | 33 ++++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/src/client/exporter_config.ml b/src/client/exporter_config.ml index c43d846c..93c3a89f 100644 --- a/src/client/exporter_config.ml +++ b/src/client/exporter_config.ml @@ -33,6 +33,10 @@ type t = { batch_timeout_ms: int; self_trace: bool; http_concurrency_level: int option; + retry_max_attempts: int; + retry_initial_delay_ms: float; + retry_max_delay_ms: float; + retry_backoff_multiplier: float; _rest: rest; } @@ -82,6 +86,10 @@ let pp out (self : t) : unit = batch_logs; batch_timeout_ms; http_concurrency_level; + retry_max_attempts; + retry_initial_delay_ms; + retry_max_delay_ms; + retry_backoff_multiplier; _rest = _; } = self @@ -95,13 +103,16 @@ let pp out (self : t) : unit = %a@];@ protocol=%a;@ timeout_ms=%d;@ timeout_traces_ms=%d;@ \ timeout_metrics_ms=%d;@ timeout_logs_ms=%d;@ batch_traces=%a;@ \ batch_metrics=%a;@ batch_logs=%a;@ batch_timeout_ms=%d;@ \ - http_concurrency_level=%a @]}" + http_concurrency_level=%a;@ retry_max_attempts=%d;@ \ + retry_initial_delay_ms=%.0f;@ retry_max_delay_ms=%.0f;@ \ + retry_backoff_multiplier=%.1f @]}" debug pp_log_level log_level sdk_disabled self_trace url_traces url_metrics url_logs ppheaders headers ppheaders headers_traces ppheaders headers_metrics ppheaders headers_logs pp_protocol protocol timeout_ms timeout_traces_ms timeout_metrics_ms timeout_logs_ms ppiopt batch_traces ppiopt batch_metrics ppiopt batch_logs batch_timeout_ms ppiopt - http_concurrency_level + http_concurrency_level retry_max_attempts retry_initial_delay_ms + retry_max_delay_ms retry_backoff_multiplier let default_url = "http://localhost:4318" @@ -128,6 +139,10 @@ type 'k make = ?batch_timeout_ms:int -> ?self_trace:bool -> ?http_concurrency_level:int -> + ?retry_max_attempts:int -> + ?retry_initial_delay_ms:float -> + ?retry_max_delay_ms:float -> + ?retry_backoff_multiplier:float -> 'k module type ENV = sig @@ -234,7 +249,8 @@ module Env () : ENV = struct ?(timeout_ms = get_timeout_from_env "OTEL_EXPORTER_OTLP_TIMEOUT" 10_000) ?timeout_traces_ms ?timeout_metrics_ms ?timeout_logs_ms ?(batch_timeout_ms = 2_000) ?(self_trace = false) ?http_concurrency_level - = + ?(retry_max_attempts = 3) ?(retry_initial_delay_ms = 100.) + ?(retry_max_delay_ms = 5000.) ?(retry_backoff_multiplier = 2.0) = let url_traces, url_metrics, url_logs = let base_url = let base_url = @@ -333,6 +349,10 @@ module Env () : ENV = struct batch_timeout_ms; self_trace; http_concurrency_level; + retry_max_attempts; + retry_initial_delay_ms; + retry_max_delay_ms; + retry_backoff_multiplier; _rest = (); } end diff --git a/src/client/exporter_config.mli b/src/client/exporter_config.mli index 40f1ccba..ef09ce05 100644 --- a/src/client/exporter_config.mli +++ b/src/client/exporter_config.mli @@ -88,6 +88,15 @@ type t = { be used to represent the size of a pool of workers where each worker gets a batch to send, send it, and repeats. @since NEXT_RELEASE *) + retry_max_attempts: int; + (** Maximum number of retry attempts for failed exports. 0 means no retry, + 1 means one retry after initial failure. Default 3. *) + retry_initial_delay_ms: float; + (** Initial delay in milliseconds before first retry. Default 100ms. *) + retry_max_delay_ms: float; + (** Maximum delay in milliseconds between retries. Default 5000ms. *) + retry_backoff_multiplier: float; + (** Multiplier for exponential backoff. Default 2.0. *) _rest: rest; } (** Configuration. @@ -123,6 +132,10 @@ type 'k make = ?batch_timeout_ms:int -> ?self_trace:bool -> ?http_concurrency_level:int -> + ?retry_max_attempts:int -> + ?retry_initial_delay_ms:float -> + ?retry_max_delay_ms:float -> + ?retry_backoff_multiplier:float -> 'k (** A function that gathers all the values needed to construct a {!t}, and produces a ['k]. ['k] is typically a continuation used to construct a diff --git a/src/client/generic_http_consumer.ml b/src/client/generic_http_consumer.ml index 9256fe4b..083722e2 100644 --- a/src/client/generic_http_consumer.ml +++ b/src/client/generic_http_consumer.ml @@ -62,6 +62,13 @@ end = struct let send (self : t) (sigs : OTEL.Any_signal_l.t) : (unit, error) result IO.t = + let should_retry = function + | `Failure _ -> true (* Network errors, connection issues *) + | `Status (code, _) -> + (* Retry on server errors, rate limits, timeouts *) + code >= 500 || code = 429 || code = 408 + | `Sysbreak -> false (* User interrupt, don't retry *) + in let res = Resource_signal.of_signal_l sigs in let url, signal_headers = match res with @@ -78,7 +85,31 @@ end = struct in let headers = List.rev_append signal_headers filtered_general in let data = Resource_signal.Encode.any ~encoder:self.encoder res in - Httpc.send self.http ~url ~headers ~decode:(`Ret ()) data + + (* Retry loop with exponential backoff *) + let rec retry_send attempt delay_ms = + let open IO in + let* result = + Httpc.send self.http ~url ~headers ~decode:(`Ret ()) data + in + match result with + | Ok x -> return (Ok x) + | Error err + when should_retry err && attempt < self.config.retry_max_attempts -> + let delay_s = delay_ms /. 1000. in + let* () = sleep_s delay_s in + let next_delay = + min self.config.retry_max_delay_ms + (delay_ms *. self.config.retry_backoff_multiplier) + in + retry_send (attempt + 1) next_delay + | Error _ as err -> return err + in + + if self.config.retry_max_attempts > 0 then + retry_send 0 self.config.retry_initial_delay_ms + else + Httpc.send self.http ~url ~headers ~decode:(`Ret ()) data end module C = Generic_consumer.Make (IO) (Notifier) (Sender)