// sdks/sandbox/go/retry.go
  1  // Copyright 2026 Alibaba Group Holding Ltd.
  2  //
  3  // Licensed under the Apache License, Version 2.0 (the "License");
  4  // you may not use this file except in compliance with the License.
  5  // You may obtain a copy of the License at
  6  //
  7  //     http://www.apache.org/licenses/LICENSE-2.0
  8  //
  9  // Unless required by applicable law or agreed to in writing, software
 10  // distributed under the License is distributed on an "AS IS" BASIS,
 11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12  // See the License for the specific language governing permissions and
 13  // limitations under the License.
 14  
 15  package opensandbox
 16  
 17  import (
 18  	"context"
 19  	"errors"
 20  	"math"
 21  	"math/rand"
 22  	"net"
 23  	"net/http"
 24  	"strconv"
 25  	"time"
 26  )
 27  
 28  // RetryConfig controls automatic retry behavior for transient errors.
 29  // A zero-value config disables retries.
 30  type RetryConfig struct {
 31  	// MaxRetries is the maximum number of retry attempts after the initial
 32  	// request. 0 means no retries (only the original attempt).
 33  	MaxRetries int
 34  
 35  	// InitialBackoff is the delay before the first retry.
 36  	InitialBackoff time.Duration
 37  
 38  	// MaxBackoff caps the delay between retries.
 39  	MaxBackoff time.Duration
 40  
 41  	// Multiplier scales the backoff after each retry attempt.
 42  	Multiplier float64
 43  
 44  	// Jitter adds randomness to avoid thundering herd. Expressed as a
 45  	// fraction of the computed delay: 0.0 = no jitter, 0.25 = +/-25%.
 46  	Jitter float64
 47  
 48  	// RetryableStatusCodes optionally overrides which HTTP status codes are
 49  	// treated as transient for retry decisions. When empty, SDK defaults are
 50  	// used (429, 502, 503, 504).
 51  	RetryableStatusCodes []int
 52  }
 53  
 54  // DefaultRetryConfig returns a retry configuration suitable for most SDK
 55  // consumers: 3 retries, 500ms initial backoff, 2x multiplier, 30s cap.
 56  func DefaultRetryConfig() RetryConfig {
 57  	return RetryConfig{
 58  		MaxRetries:     3,
 59  		InitialBackoff: 500 * time.Millisecond,
 60  		MaxBackoff:     30 * time.Second,
 61  		Multiplier:     2.0,
 62  		Jitter:         0.25,
 63  		RetryableStatusCodes: []int{
 64  			http.StatusTooManyRequests,
 65  			http.StatusBadGateway,
 66  			http.StatusServiceUnavailable,
 67  			http.StatusGatewayTimeout,
 68  		},
 69  	}
 70  }
 71  
 72  // WithRetry enables automatic retry with exponential backoff for transient
 73  // errors and network failures. By default, transient status codes are
 74  // 429/502/503/504; override RetryConfig.RetryableStatusCodes to customize.
 75  func WithRetry(cfg RetryConfig) Option {
 76  	return func(c *Client) {
 77  		c.retry = &cfg
 78  	}
 79  }
 80  
 81  // IsTransient reports whether the API error represents a transient server
 82  // condition that may succeed on retry.
 83  func (e *APIError) IsTransient() bool {
 84  	return isTransientStatus(e.StatusCode)
 85  }
 86  
 87  // isTransientStatus classifies HTTP status codes.
 88  //
 89  //	Retryable: 429 (rate limit), 502, 503, 504 (infrastructure).
 90  //	Permanent: everything else (400, 401, 403, 404, 409, 422, ...).
 91  func isTransientStatus(code int) bool {
 92  	switch code {
 93  	case http.StatusTooManyRequests,
 94  		http.StatusBadGateway,
 95  		http.StatusServiceUnavailable,
 96  		http.StatusGatewayTimeout:
 97  		return true
 98  	default:
 99  		return false
100  	}
101  }
102  
103  func (r *RetryConfig) isRetryableStatus(code int) bool {
104  	codes := r.RetryableStatusCodes
105  	if len(codes) == 0 {
106  		return isTransientStatus(code)
107  	}
108  	for _, c := range codes {
109  		if c == code {
110  			return true
111  		}
112  	}
113  	return false
114  }
115  
116  // isTransientError checks whether err should trigger a retry. It handles
117  // *APIError (HTTP status classification) and net.Error (network-level).
118  func isTransientError(err error, cfg *RetryConfig) bool {
119  	if err == nil {
120  		return false
121  	}
122  	var apiErr *APIError
123  	if errors.As(err, &apiErr) {
124  		if cfg != nil {
125  			return cfg.isRetryableStatus(apiErr.StatusCode)
126  		}
127  		return apiErr.IsTransient()
128  	}
129  	var netErr net.Error
130  	return errors.As(err, &netErr)
131  }
132  
133  // backoff computes the delay for attempt n (0-indexed) with optional jitter.
134  func (r *RetryConfig) backoff(attempt int) time.Duration {
135  	delay := float64(r.InitialBackoff) * math.Pow(r.Multiplier, float64(attempt))
136  	if delay > float64(r.MaxBackoff) {
137  		delay = float64(r.MaxBackoff)
138  	}
139  	if r.Jitter > 0 {
140  		jitter := delay * r.Jitter
141  		delay = delay - jitter + rand.Float64()*2*jitter
142  	}
143  	return time.Duration(delay)
144  }
145  
146  // retryDelay returns the backoff duration, respecting Retry-After if present.
147  func retryDelay(cfg *RetryConfig, attempt int, err error) time.Duration {
148  	computed := cfg.backoff(attempt)
149  
150  	var apiErr *APIError
151  	if errors.As(err, &apiErr) && apiErr.RetryAfter > 0 {
152  		if apiErr.RetryAfter > computed {
153  			return apiErr.RetryAfter
154  		}
155  	}
156  	return computed
157  }
158  
// parseRetryAfter extracts the Retry-After header of resp as a duration.
//
// Two header forms are understood: a positive integer number of seconds, and
// an HTTP date (parsed with http.ParseTime), for which the time remaining
// until that date is returned. It returns 0 when resp is nil, when the header
// is absent or unparseable, and when the value is not in the future (zero or
// negative seconds, or a date that has already passed).
//
// A seconds value too large to represent as a time.Duration saturates at the
// maximum duration instead of overflowing into a negative or arbitrary
// delay; the header is server-controlled, so it must not be trusted to be
// within range.
func parseRetryAfter(resp *http.Response) time.Duration {
	if resp == nil {
		return 0
	}
	val := resp.Header.Get("Retry-After")
	if val == "" {
		return 0
	}
	if secs, err := strconv.Atoi(val); err == nil && secs > 0 {
		// Largest whole number of seconds that fits in a time.Duration.
		const maxSecs = math.MaxInt64 / int64(time.Second)
		if int64(secs) > maxSecs {
			return math.MaxInt64
		}
		return time.Duration(secs) * time.Second
	}
	if t, err := http.ParseTime(val); err == nil {
		if d := time.Until(t); d > 0 {
			return d
		}
	}
	return 0
}
179  
// retrySleep blocks for d, returning early if ctx is cancelled.
//
// It returns nil once the full delay has elapsed and ctx.Err() if the context
// ended first. Cancellation takes priority: if ctx is already done when
// retrySleep is called it returns the context's error immediately, whatever
// the value of d. Checking up front matters because a select with both the
// context and an already-expired timer ready picks one at random, which would
// otherwise let a cancelled caller fall through to another attempt.
//
// A non-positive d returns nil without allocating a timer.
func retrySleep(ctx context.Context, d time.Duration) error {
	if err := ctx.Err(); err != nil {
		return err
	}
	if d <= 0 {
		return nil
	}

	timer := time.NewTimer(d)
	defer timer.Stop()
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}
191  
192  // withRetry executes fn, retrying on transient errors per the client's
193  // RetryConfig. If retry is nil or MaxRetries is 0, fn is called once.
194  func (c *Client) withRetry(ctx context.Context, fn func() error) error {
195  	if c.retry == nil || c.retry.MaxRetries == 0 {
196  		return fn()
197  	}
198  
199  	var lastErr error
200  	for attempt := 0; attempt <= c.retry.MaxRetries; attempt++ {
201  		lastErr = fn()
202  		if lastErr == nil {
203  			return nil
204  		}
205  		if !isTransientError(lastErr, c.retry) {
206  			return lastErr
207  		}
208  		if attempt == c.retry.MaxRetries {
209  			break
210  		}
211  		delay := retryDelay(c.retry, attempt, lastErr)
212  		if err := retrySleep(ctx, delay); err != nil {
213  			return err
214  		}
215  	}
216  	return lastErr
217  }