/ crates / tor-log-ratelim / src / macros.rs
macros.rs
  1  //! Macros to make it easier to work with rate-limited logs
  2  //!
  3  
  4  /// Log a rate-limited failure message.
  5  ///
  6  /// This macro looks at a single `Result<T,E>`,
  7  /// and keeps track of how many times the `Ok` and `Err` branches are seen.
  8  /// After a delay, it reports via [`tracing::event!`]
  9  /// how many errors it has seen since its last report.
 10  /// (It only reports an absence of errors
 11  /// when such an absence has followed
 12  /// an error report.)
 13  ///
 14  /// ## A simple example
 15  ///
 16  /// ```
 17  /// # use std::num::ParseIntError;
 18  /// # let s = "1234";
 19  /// # let source = "the cache";
 20  /// use tor_log_ratelim::log_ratelim;
 21  ///  let r: Result<u8, ParseIntError> = s.parse();
 22  ///
 23  /// log_ratelim!(
 24  ///   // The activity we were doing.
 25  ///   "Parsing a value from {}", source;
 26  ///   // The result we got.
 27  ///   r;
 28  /// );
 29  /// ```
 30  ///
 31  /// This invocation could report errors like
 32  /// ```text
 33  /// WARN: Parsing a value from the cache: error (occurred 9/12 times in the last 5 minutes): number too large to fit in target type
 34  /// ```
 35  ///
 36  /// After a while without errors, it might log:
 37  /// ```text
 38  /// WARN: Parsing a value from the cache: now working (occurred 0/100 times in th last hour)
 39  /// ```
 40  ///
 41  /// ## Important concept: Activities
 42  ///
 43  /// Every invocation of `log_ratelim!` defines a _set_ of rate limits
 44  /// with respect to a collection of **activities**.  
 45  /// Each separate **activity** value gets its own rate limit.
 46  /// This lets you have separate rate limits for different operations,
 47  /// such as connecting to different parties,
 48  /// or invoking different programs.
 49  ///
 50  /// Typical activities might be
 51  /// `"Connecting to port {}", p`
 52  /// or
 53  /// `"Trying to start program {}", p`
 54  ///
 55  /// (These activities should be described using a verb ending with "-ing",
 56  /// to make the output nice.)
 57  ///
 58  /// ## Requirements on the error type.
 59  ///
 60  /// The error type for each `Result` passed to this macro must implement:
 61  ///  * [`Clone`]
 62  ///  * [`Send`]
 63  ///  * [`Error`](std::error::Error)
 64  ///
 65  /// ## Reports are representative
 66  ///
 67  /// No matter how many failures are seen between log messages,
 68  /// `log_ratelim!` only records and reports
 69  /// one error for each time it logs.
 70  ///
 71  /// Its current behavior is to record and report
 72  /// the _first_ error for each logged interval,
 73  /// and discard the others.  
 74  /// This can lead to confusing results if the error is not representative.
 75  ///
 76  /// ## Advanced syntax
 77  ///
 78  /// The `log_ratelim` macro can record additional information for its
 79  /// representative error report,
 80  /// and can log information on successes as well.
 81  ///
 82  /// A full invocation of `log_ratelim!` looks like this:
 83  ///
 84  /// ```
 85  /// # use std::num::ParseIntError;
 86  /// # let s = "1234";
 87  /// # let source = "the cache";
 88  /// # let more_information = |_| "";
 89  /// use tor_log_ratelim::log_ratelim;
 90  /// let r: Result<u8, ParseIntError> = s.parse();
 91  /// log_ratelim!(
 92  ///   "Parsing a value from {}", source;
 93  ///   r;
 94  ///   Err(x) => WARN, "The problem was {}", more_information(x);
 95  ///   Ok(v) => TRACE, "Parsed {} successfully", v;
 96  /// );
 97  /// ```
 98  ///
 99  /// Here the clause starting with `Err(x)`
100  /// tells the logger to include a message along with the error report,
101  /// and we explicitly specifies the level at which
102  /// to log our failure reports.
103  ///
104  /// Note that the error itself is **always** reported;
105  /// there is no need to say
106  /// `Err(e) => WARN, "{}", e`.
107  /// In fact, doing so will create a redundant report of
108  /// the error.
109  //
110  // TODO: I don't think it makes sense to have an Ok() logger.
111  // Instead, we could just say
112  //    log_ratelim!("Parsing a value from {}", source; r;);
113  //    trace!("parsed value from {}: {:?}", source, r);
114  // This is probably better, since it logs a trace for every occurrence.
115  //
116  /// The clause starting with `Ok(v)` tells the logger what to do on success:
117  /// each individual success causes a _non-rate-limited_
118  /// message at TRACE level.
119  ///
120  /// The `Ok() ...` clause
121  /// and the `Err() ...` clause
122  /// are both optional.
123  ///
124  /// Within the Err() clause,
125  /// the format string and its arguments
126  /// are optional.
127  //
128  // TODO performance notes:
129  //
130  // There are many opportunities for possibly making this code go faster:
131  //  - Optimize the case where there is only one activity.
132  //  - Use a non-string key to distinguish activities, to avoid formatting
133  //    the activity string needlessly.
134  //  - Limit copies (of activity and of error).
135  //  - Use Event and Metadata from the tracing crate to defer formatting
136  //  - Check Metadata early for the case where we don't want to report the
137  //    event at all.
138  //
139  // Let's not pursue any of these until we know that this code actually
140  // shows up in a critical path.
141  #[macro_export]
142  macro_rules! log_ratelim {
143    // ====
144    // Actual implementation for rate-limited logging.
145    // ====
146  
147    // Nobody invokes this syntax directly; it's used as a common body for the
148    // various other syntaxes.
149    {
150      @impl activity_format: ( $act_fmt:literal $(, $act_arg:expr)* ) ;
151            result: ($result:expr ) ;
152            on_error: (Err($err_pat:pat), $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? );
153            $( on_ok: (Ok($ok_pat:pat), $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr)*  ); )?
154    } => {
155      #[allow(clippy::redundant_closure_call)]
156      (||{
157      use $crate::macro_prelude::*;
158      let Some(runtime) = rt_support() else {
159        // Nobody has called `install_runtime()`: we should just log whatever
160        // happened and not worry about the rate-limiting.
161        match &$result {
162          #[allow(clippy::redundant_pattern)]
163          Err(ref the_error @ $err_pat) => {
164            tracing::event!(
165              tracing::Level::$err_level,
166              concat!($act_fmt, $(": ", $err_fmt, )? ": {}"),
167              $($act_arg,)*
168              $( $($err_arg, )* )?
169              the_error.report()
170            );
171          }
172          $(Ok($ok_pat) => {
173            tracing::event!(
174              tracing::Level::$ok_level,
175              $ok_fmt
176              $(, $ok_arg)*
177            );
178          })?
179          #[allow(unreachable_patterns)]
180          Ok(_) => {}
181        }
182        return;
183      };
184  
185      /// An implementation of Loggable for this log message.
186      //
187      // We use a separate implementation here so that the tracing metadata will get
188      // constructed correctly.  If we called tracing::event! from a location in
189      // `tor-log-ratelim`, all the messages would appear to originate from there.
190      //
191      // (TODO: We could use tracing::Metadata explicitly, perhaps? That might be hard.)
192      struct Lg(LogState);
193      impl Loggable for Lg {
194          fn flush(&mut self, summarizing: std::time::Duration) -> Activity {
195              let activity = self.0.activity();
196              match activity {
197                 Activity::Active => {
198                    tracing::event!(
199                        tracing::Level::$err_level,
200                        "{}",
201                        self.0.display_problem(summarizing)
202                    );
203                 }
204                 Activity::Dormant => {
205                    tracing::event!(
206                        // Using err_level here is in some respects confusing:
207                        // if the _presence_ of the problem is (say) a WARN,
208                        // why should its newfound absence also be a WARN?
209                        //
210                        // We have had to decide which is worse:
211                        // that a user only watching WARNs
212                        // might not see a problem has gone away,
213                        // or that a non-problem would be reported
214                        // at an excessive severity.
215                        // We went with the latter.
216                        tracing::Level::$err_level,
217                        "{}",
218                        self.0.display_recovery(summarizing)
219                    );
220                 }
221              }
222              self.0.reset();
223              activity
224          }
225      }
226  
227      /// A lazy map from activity keys to weak RateLim handles.
228      //
229      // The strong reference for each RateLim is held by a task that flushes
230      // the logger as appropriate, and drops the strong reference once it's
231      // quiescent.
232      static LOGGERS: Lazy<Mutex<WeakValueHashMap<String, Weak<RateLim<Lg>>>>> =
233        Lazy::new(|| Mutex::new(WeakValueHashMap::new()));
234  
235      // We assign a separate rate limit for each activity.
236      // For now, this is string-ly typed.
237      let activity = format!($act_fmt $(, $act_arg)*);
238      let key = activity.clone();
239  
240      match &$result {
241        #[allow(clippy::redundant_pattern)]
242        Err(ref the_error @ $err_pat) => {
243          // The operation failed.
244          //
245          // 1) Create a rate-limited logger for this activity if one  did not
246          //    already exist.
247          let logger = LOGGERS
248            .lock()
249            .expect("poisoned lock")
250            .entry(key)
251            .or_insert_with(|| RateLim::new(Lg(LogState::new(activity))));
252          // 2) Note failure in the activity with note_fail().
253          logger.event(runtime, |lg| lg.0.note_fail(||
254            // 2b) If this is the first time that this activity failed since the
255            //     last flush, record the formatted err_msg, and a Clone of the error.
256            (
257              $crate::log_ratelim!{@first_nonempty
258                { $( Some(format!($err_fmt $(, $err_arg)* )) )? }
259                { None }
260              },
261              Some(Box::new(the_error.clone()))
262            )
263          ));
264        }
265        Ok($crate::log_ratelim!{@first_nonempty { $($ok_pat)? } {_} }) => {
266          // The operation succeeded.
267          //
268          // 1) If this activity is tracked, call note_ok() on it.
269          if let Some(logger) = LOGGERS
270            .lock()
271            .expect("poisoned lock")
272            .get(&key) {
273              logger.nonevent(|lg| lg.0.note_ok());
274            }
275          // 2) If we have a per-success item to log, log it.
276          $(
277          tracing::event!(tracing::Level::$ok_level, $ok_fmt $(, $ok_arg )* );
278          )?
279        }
280      }
281    })()
282    };
283  
284    // ======
285    // Exposed, documented syntax.
286    // ======
287  
288    // Regular invocation with an Err(_) case.
289    {
290      $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
291      $result:expr ;
292      Err($err_pat:pat) => $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? $(,)?
293      $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)?  )?
294      $(;)?
295    } => {
296      $crate::log_ratelim!{
297        @impl
298          activity_format: ( $act_fmt $(, $act_arg)* );
299          result: ($result);
300          on_error: (Err($err_pat), $err_level $(, $err_fmt $(, $err_arg)* )? );
301          $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
302      }
303    };
304    // Regular invocation with no Err(_) case.
305    {
306      $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
307      $result:expr
308      $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)? )?
309      $(;)?
310    } => {
311      $crate::log_ratelim!{
312        @impl
313          activity_format: ( $act_fmt $(, $act_arg)* );
314          result: ($result);
315          on_error: (Err(_), WARN);
316          $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
317      }
318    };
319  
320    // Expand to the first of two bodies that has at least one token in it.
321    { @first_nonempty { $($a:tt)+ } { $($b:tt)* }} => { $($a)+ };
322    { @first_nonempty { } { $($b:tt)* } } => { $($b)+ };
323  }
324  
325  #[cfg(test)]
326  mod test_syntax {
327      #![allow(dead_code)]
328  
329      #[derive(Clone, Debug, thiserror::Error)]
330      enum MyErr {
331          #[error("it didn't work")]
332          DidntWork,
333      }
334      impl MyErr {
335          fn badness(&self) -> u8 {
336              3
337          }
338      }
339  
340      /// This doesn't actually run or test anything; it just makes sure that all
341      /// the different syntaxes work.
342      fn various_syntaxes(friend: &str, r: &Result<u32, MyErr>) {
343          log_ratelim!(
344            "saying hi to {}", friend;
345            r;
346          );
347  
348          log_ratelim!(
349            "saying hi to {}", friend;
350            r;
351            Err(_) => WARN;
352          );
353  
354          log_ratelim!(
355            "saying hi to {}", friend;
356            r;
357            Err(e) => WARN, "badness={}", e.badness();
358          );
359  
360          log_ratelim!(
361            "saying hi to {}", friend;
362            r;
363            Ok(v) => TRACE, "nothing bad happened; v={}", v;
364          );
365  
366          log_ratelim!(
367            "saying hi to {}", friend;
368            r;
369            Ok(v) => TRACE, "nothing bad happened; v={}", v;
370          );
371  
372          log_ratelim!(
373            "saying hi to {}", friend;
374            r;
375            Err(e) => WARN, "badness={}", e.badness();
376            Ok(v) => TRACE, "nothing bad happened; v={}", v;
377          );
378      }
379  }