retry.go
// Copyright 2026 Alibaba Group Holding Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package opensandbox

import (
	"context"
	"errors"
	"math"
	"math/rand"
	"net"
	"net/http"
	"strconv"
	"time"
)

// RetryConfig controls automatic retry behavior for transient errors.
// A zero-value config disables retries, because MaxRetries is 0.
type RetryConfig struct {
	// MaxRetries is the maximum number of retry attempts after the initial
	// request. 0 means no retries (only the original attempt).
	MaxRetries int

	// InitialBackoff is the delay before the first retry.
	InitialBackoff time.Duration

	// MaxBackoff caps the delay between retries.
	MaxBackoff time.Duration

	// Multiplier scales the backoff after each retry attempt: the base delay
	// for the n-th retry (0-indexed) is InitialBackoff * Multiplier^n.
	Multiplier float64

	// Jitter adds randomness to avoid thundering herd. Expressed as a
	// fraction of the computed delay: 0.0 = no jitter, 0.25 = +/-25%.
	Jitter float64

	// RetryableStatusCodes optionally overrides which HTTP status codes are
	// treated as transient for retry decisions. When empty, SDK defaults are
	// used (429, 502, 503, 504).
	RetryableStatusCodes []int
}

// DefaultRetryConfig returns a retry configuration suitable for most SDK
// consumers: 3 retries, 500ms initial backoff, 2x multiplier, 30s cap,
// +/-25% jitter, and the default retryable status codes (429, 502, 503, 504).
56 func DefaultRetryConfig() RetryConfig { 57 return RetryConfig{ 58 MaxRetries: 3, 59 InitialBackoff: 500 * time.Millisecond, 60 MaxBackoff: 30 * time.Second, 61 Multiplier: 2.0, 62 Jitter: 0.25, 63 RetryableStatusCodes: []int{ 64 http.StatusTooManyRequests, 65 http.StatusBadGateway, 66 http.StatusServiceUnavailable, 67 http.StatusGatewayTimeout, 68 }, 69 } 70 } 71 72 // WithRetry enables automatic retry with exponential backoff for transient 73 // errors and network failures. By default, transient status codes are 74 // 429/502/503/504; override RetryConfig.RetryableStatusCodes to customize. 75 func WithRetry(cfg RetryConfig) Option { 76 return func(c *Client) { 77 c.retry = &cfg 78 } 79 } 80 81 // IsTransient reports whether the API error represents a transient server 82 // condition that may succeed on retry. 83 func (e *APIError) IsTransient() bool { 84 return isTransientStatus(e.StatusCode) 85 } 86 87 // isTransientStatus classifies HTTP status codes. 88 // 89 // Retryable: 429 (rate limit), 502, 503, 504 (infrastructure). 90 // Permanent: everything else (400, 401, 403, 404, 409, 422, ...). 91 func isTransientStatus(code int) bool { 92 switch code { 93 case http.StatusTooManyRequests, 94 http.StatusBadGateway, 95 http.StatusServiceUnavailable, 96 http.StatusGatewayTimeout: 97 return true 98 default: 99 return false 100 } 101 } 102 103 func (r *RetryConfig) isRetryableStatus(code int) bool { 104 codes := r.RetryableStatusCodes 105 if len(codes) == 0 { 106 return isTransientStatus(code) 107 } 108 for _, c := range codes { 109 if c == code { 110 return true 111 } 112 } 113 return false 114 } 115 116 // isTransientError checks whether err should trigger a retry. It handles 117 // *APIError (HTTP status classification) and net.Error (network-level). 
118 func isTransientError(err error, cfg *RetryConfig) bool { 119 if err == nil { 120 return false 121 } 122 var apiErr *APIError 123 if errors.As(err, &apiErr) { 124 if cfg != nil { 125 return cfg.isRetryableStatus(apiErr.StatusCode) 126 } 127 return apiErr.IsTransient() 128 } 129 var netErr net.Error 130 return errors.As(err, &netErr) 131 } 132 133 // backoff computes the delay for attempt n (0-indexed) with optional jitter. 134 func (r *RetryConfig) backoff(attempt int) time.Duration { 135 delay := float64(r.InitialBackoff) * math.Pow(r.Multiplier, float64(attempt)) 136 if delay > float64(r.MaxBackoff) { 137 delay = float64(r.MaxBackoff) 138 } 139 if r.Jitter > 0 { 140 jitter := delay * r.Jitter 141 delay = delay - jitter + rand.Float64()*2*jitter 142 } 143 return time.Duration(delay) 144 } 145 146 // retryDelay returns the backoff duration, respecting Retry-After if present. 147 func retryDelay(cfg *RetryConfig, attempt int, err error) time.Duration { 148 computed := cfg.backoff(attempt) 149 150 var apiErr *APIError 151 if errors.As(err, &apiErr) && apiErr.RetryAfter > 0 { 152 if apiErr.RetryAfter > computed { 153 return apiErr.RetryAfter 154 } 155 } 156 return computed 157 } 158 159 // parseRetryAfter extracts the Retry-After header value as a duration. 160 // Returns 0 if the header is absent or unparseable. 161 func parseRetryAfter(resp *http.Response) time.Duration { 162 if resp == nil { 163 return 0 164 } 165 val := resp.Header.Get("Retry-After") 166 if val == "" { 167 return 0 168 } 169 if secs, err := strconv.Atoi(val); err == nil && secs > 0 { 170 return time.Duration(secs) * time.Second 171 } 172 if t, err := http.ParseTime(val); err == nil { 173 if d := time.Until(t); d > 0 { 174 return d 175 } 176 } 177 return 0 178 } 179 180 // retrySleep waits for d or until ctx is cancelled. 
181 func retrySleep(ctx context.Context, d time.Duration) error { 182 t := time.NewTimer(d) 183 defer t.Stop() 184 select { 185 case <-ctx.Done(): 186 return ctx.Err() 187 case <-t.C: 188 return nil 189 } 190 } 191 192 // withRetry executes fn, retrying on transient errors per the client's 193 // RetryConfig. If retry is nil or MaxRetries is 0, fn is called once. 194 func (c *Client) withRetry(ctx context.Context, fn func() error) error { 195 if c.retry == nil || c.retry.MaxRetries == 0 { 196 return fn() 197 } 198 199 var lastErr error 200 for attempt := 0; attempt <= c.retry.MaxRetries; attempt++ { 201 lastErr = fn() 202 if lastErr == nil { 203 return nil 204 } 205 if !isTransientError(lastErr, c.retry) { 206 return lastErr 207 } 208 if attempt == c.retry.MaxRetries { 209 break 210 } 211 delay := retryDelay(c.retry, attempt, lastErr) 212 if err := retrySleep(ctx, delay); err != nil { 213 return err 214 } 215 } 216 return lastErr 217 }