/ scripts / contributor_audit.py
contributor_audit.py
  1  #!/usr/bin/env python3
  2  """Contributor Audit Script
  3  
  4  Cross-references git authors, Co-authored-by trailers, and salvaged PR
  5  descriptions to find any contributors missing from the release notes.
  6  
  7  Usage:
  8      # Basic audit since a tag
  9      python scripts/contributor_audit.py --since-tag v2026.4.8
 10  
    # Audit a bounded range by giving an explicit end ref (default: HEAD)
 12      python scripts/contributor_audit.py --since-tag v2026.4.8 --until v2026.4.13
 13  
 14      # Compare against a release notes file
 15      python scripts/contributor_audit.py --since-tag v2026.4.8 --release-file RELEASE_v0.9.0.md
 16  """
 17  
 18  import argparse
 19  import json
 20  import os
 21  import re
 22  import subprocess
 23  import sys
 24  from collections import defaultdict
 25  from pathlib import Path
 26  
 27  # ---------------------------------------------------------------------------
 28  # Import AUTHOR_MAP and resolve_author from the sibling release.py module
 29  # ---------------------------------------------------------------------------
# Absolute directory containing this script (symlinks resolved).
SCRIPT_DIR = Path(__file__).resolve().parent
# Put the script directory first on sys.path so the sibling `release` module
# can be imported no matter what the caller's working directory is. This must
# run before the import below.
sys.path.insert(0, str(SCRIPT_DIR))

from release import AUTHOR_MAP, resolve_author  # noqa: E402

# Parent of the script directory; git() uses it as the default working directory.
REPO_ROOT = SCRIPT_DIR.parent
 36  
 37  # ---------------------------------------------------------------------------
 38  # AI assistants, bots, and machine accounts to exclude from contributor lists
 39  # ---------------------------------------------------------------------------
 40  IGNORED_PATTERNS = [
 41      re.compile(r"^Claude", re.IGNORECASE),
 42      re.compile(r"^Copilot$", re.IGNORECASE),
 43      re.compile(r"^Cursor\s+Agent$", re.IGNORECASE),
 44      re.compile(r"^GitHub\s*Actions?$", re.IGNORECASE),
 45      re.compile(r"^dependabot", re.IGNORECASE),
 46      re.compile(r"^renovate", re.IGNORECASE),
 47      re.compile(r"^Hermes\s+(Agent|Audit)$", re.IGNORECASE),
 48      re.compile(r"^Ubuntu$", re.IGNORECASE),
 49  ]
 50  
 51  IGNORED_EMAILS = {
 52      "noreply@anthropic.com",
 53      "noreply@github.com",
 54      "cursoragent@cursor.com",
 55      "hermes@nousresearch.com",
 56      "hermes-audit@example.com",
 57      "hermes@habibilabs.dev",
 58  }
 59  
 60  
 61  def is_ignored(handle: str, email: str = "") -> bool:
 62      """Return True if this contributor is a bot/AI/machine account."""
 63      if email in IGNORED_EMAILS:
 64          return True
 65      for pattern in IGNORED_PATTERNS:
 66          if pattern.search(handle):
 67              return True
 68      return False
 69  
 70  
 71  # ---------------------------------------------------------------------------
 72  # Helpers
 73  # ---------------------------------------------------------------------------
 74  
 75  def git(*args, cwd=None):
 76      """Run a git command and return stdout."""
 77      result = subprocess.run(
 78          ["git"] + list(args),
 79          capture_output=True,
 80          text=True,
 81          cwd=cwd or str(REPO_ROOT),
 82      )
 83      if result.returncode != 0:
 84          print(f"  [warn] git {' '.join(args)} failed: {result.stderr.strip()}", file=sys.stderr)
 85          return ""
 86      return result.stdout.strip()
 87  
 88  
 89  def gh_pr_list():
 90      """Fetch merged PRs from GitHub using the gh CLI.
 91  
 92      Returns a list of dicts with keys: number, title, body, author.
 93      Returns an empty list if gh is not available or the call fails.
 94      """
 95      try:
 96          result = subprocess.run(
 97              [
 98                  "gh", "pr", "list",
 99                  "--repo", "NousResearch/hermes-agent",
100                  "--state", "merged",
101                  "--json", "number,title,body,author,mergedAt",
102                  "--limit", "300",
103              ],
104              capture_output=True,
105              text=True,
106              timeout=60,
107          )
108          if result.returncode != 0:
109              print(f"  [warn] gh pr list failed: {result.stderr.strip()}", file=sys.stderr)
110              return []
111          return json.loads(result.stdout)
112      except FileNotFoundError:
113          print("  [warn] 'gh' CLI not found — skipping salvaged PR scan.", file=sys.stderr)
114          return []
115      except subprocess.TimeoutExpired:
116          print("  [warn] gh pr list timed out — skipping salvaged PR scan.", file=sys.stderr)
117          return []
118      except json.JSONDecodeError:
119          print("  [warn] gh pr list returned invalid JSON — skipping salvaged PR scan.", file=sys.stderr)
120          return []
121  
122  
123  # ---------------------------------------------------------------------------
124  # Contributor collection
125  # ---------------------------------------------------------------------------
126  
# Phrases in PR bodies that attribute the work to someone else. Every pattern
# has exactly one capture group: a GitHub handle, or a PR number for the
# "#123" form.
SALVAGE_PATTERNS = [
    re.compile(source)
    for source in (
        # "Salvaged from @username"
        r"[Ss]alvaged\s+from\s+@(\w[\w-]*)",
        # "Salvaged from #123"
        r"[Ss]alvaged\s+from\s+#(\d+)",
        # "Cherry-picked from @username"
        r"[Cc]herry[- ]?picked\s+from\s+@(\w[\w-]*)",
        # "Based on work by @username"
        r"[Bb]ased\s+on\s+work\s+by\s+@(\w[\w-]*)",
        # "Original PR by @username"
        r"[Oo]riginal\s+PR\s+by\s+@(\w[\w-]*)",
        # "Co-authored with @username"
        r"[Cc]o[- ]?authored\s+with\s+@(\w[\w-]*)",
    )
]

# "Co-authored-by: Name <email>" trailer in a commit message.
# Group 1 is the display name, group 2 the email address.
CO_AUTHORED_RE = re.compile(r"Co-authored-by:\s*(.+?)\s*<([^>]+)>", flags=re.IGNORECASE)
147  
148  
def collect_commit_authors(since_tag, until="HEAD"):
    """Collect contributors from git commit authors.

    Scans the non-merge commits in ``since_tag..until`` and resolves every
    author through ``resolve_author``.

    Args:
        since_tag: Git ref marking the exclusive start of the range.
        until: Git ref marking the inclusive end of the range.

    Returns:
        contributors: dict mapping github_handle -> set of source labels.
            Authors that could not be resolved are keyed by whatever
            ``resolve_author`` returned for them (a plain name).
        unknown_emails: dict mapping email -> git name (for emails not in AUTHOR_MAP)
    """
    # Name and email are separated with a NUL byte (%x00) rather than "|".
    # A pipe is legal inside an author name and would shift every following
    # field, attributing the commit to the wrong name and email.
    log = git(
        "log", f"{since_tag}..{until}",
        "--format=%an%x00%ae",
        "--no-merges",
    )

    contributors = defaultdict(set)
    unknown_emails = {}

    for line in log.split("\n"):
        if not line.strip():
            continue
        name, separator, email = line.partition("\x00")
        if not separator:
            # Not a "name NUL email" record; skip instead of guessing.
            continue

        handle = resolve_author(name, email)
        # resolve_author returns "@handle" or plain name
        if handle.startswith("@"):
            contributors[handle.lstrip("@")].add("commit")
        else:
            # Could not resolve — record as unknown
            contributors[handle].add("commit")
            unknown_emails[email] = name

    return contributors, unknown_emails
187  
188  
def collect_co_authors(since_tag, until="HEAD"):
    """Gather contributors named in Co-authored-by trailers.

    Every line of every non-merge commit body in ``since_tag..until`` is
    searched with ``CO_AUTHORED_RE``; each hit is resolved through
    ``resolve_author``.

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        unknown_emails: dict mapping email -> git name
    """
    contributors = defaultdict(set)
    unknown_emails = {}

    # Commit bodies for the whole range, each preceded by a marker line.
    history = git(
        "log", f"{since_tag}..{until}",
        "--format=__COMMIT__%H%n%b",
        "--no-merges",
    )
    if not history:
        return contributors, unknown_emails

    for text in history.split("\n"):
        trailer = CO_AUTHORED_RE.search(text)
        if trailer is None:
            continue

        name, email = (part.strip() for part in trailer.groups())
        handle = resolve_author(name, email)
        resolved = handle.startswith("@")

        key = handle.lstrip("@") if resolved else handle
        contributors[key].add("co-author")
        if not resolved:
            # No mapping for this email yet; report it to the caller.
            unknown_emails[email] = name

    return contributors, unknown_emails
223  
224  
def _parse_iso_timestamp(value):
    """Parse an ISO-8601 timestamp into a timezone-aware datetime, or None."""
    from datetime import datetime, timezone

    if not isinstance(value, str) or not value.strip():
        return None
    text = value.strip()
    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on,
    # so spell the UTC offset out explicitly.
    if text[-1] in "Zz":
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        # Aware and naive datetimes cannot be compared; treat naive as UTC.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def collect_salvaged_contributors(since_tag, until="HEAD"):
    """Scan merged PR bodies for salvage/cherry-pick/co-author attribution.

    Uses the gh CLI to fetch PRs, then filters to the date range defined
    by since_tag..until and scans bodies for salvage patterns.

    The range bounds are the author dates of the ``since_tag`` and ``until``
    commits. They are compared with each PR's ``mergedAt`` as timezone-aware
    datetimes: git reports dates with a local UTC offset while gh reports
    UTC, so comparing the raw strings would order them incorrectly.

    Only handles named by a salvage pattern are credited; the PR's own author
    is not added by this function. "#123"-style references are skipped because
    resolving them to an author would need another API call.

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        pr_refs: dict mapping github_handle -> list of PR numbers where found
            (each PR number listed once per handle)
    """
    contributors = defaultdict(set)
    pr_refs = defaultdict(list)

    # Determine the date range from git tags/refs
    since_date = git("log", "-1", "--format=%aI", since_tag)
    until_date = git("log", "-1", "--format=%aI", until)

    since_dt = _parse_iso_timestamp(since_date)
    if since_dt is None:
        print(f"  [warn] Could not resolve date for {since_tag}", file=sys.stderr)
        return contributors, pr_refs
    # An unresolved end ref simply leaves the range open-ended.
    until_dt = _parse_iso_timestamp(until_date)

    prs = gh_pr_list()
    if not prs:
        return contributors, pr_refs

    for pr in prs:
        # Filter by merge date if available; PRs without a usable date are kept.
        merged_dt = _parse_iso_timestamp(pr.get("mergedAt"))
        if merged_dt is not None:
            if merged_dt < since_dt:
                continue
            if until_dt is not None and merged_dt > until_dt:
                continue

        body = pr.get("body") or ""
        pr_number = pr.get("number", "?")

        for pattern in SALVAGE_PATTERNS:
            for match in pattern.finditer(body):
                value = match.group(1)
                # If it's a number, it's a PR reference — skip for now
                # (would need another API call to resolve PR author)
                if value.isdigit():
                    continue
                contributors[value].add("salvage")
                # A handle can be named several times in one body; list the
                # PR only once.
                if pr_number not in pr_refs[value]:
                    pr_refs[value].append(pr_number)

    return contributors, pr_refs
280  
281  
282  # ---------------------------------------------------------------------------
283  # Release file comparison
284  # ---------------------------------------------------------------------------
285  
def check_release_file(release_file, all_contributors):
    """Check which contributors are mentioned in the release file.

    A contributor counts as mentioned when the handle (with or without a
    leading "@") appears in the file as a whole token, case-insensitively.
    Token boundaries are enforced so that a handle is not considered present
    merely because it is a fragment of a longer word or of another handle
    (e.g. "bob" inside "bobby", or "alice" inside "@alice-smith").

    Args:
        release_file: Path to the release notes file, read as UTF-8.
        all_contributors: Iterable of contributor handles/names to look for.

    Returns:
        mentioned: set of handles found in the file
        missing: set of handles NOT found in the file. If the file does not
            exist, an error is printed to stderr and every contributor is
            reported as missing.
    """
    try:
        # Release notes commonly contain non-ASCII text; do not depend on the
        # platform's default encoding, and do not crash on a stray bad byte.
        content = Path(release_file).read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        print(f"  [error] Release file not found: {release_file}", file=sys.stderr)
        return set(), set(all_contributors)

    mentioned = set()
    missing = set()
    content_lower = content.lower()

    for handle in all_contributors:
        needle = re.escape(handle.lower())
        # The handle must not be preceded or followed by a character that can
        # be part of a handle (word characters or "-").
        pattern = rf"(?<![\w-])@?{needle}(?![\w-])"
        if re.search(pattern, content_lower):
            mentioned.add(handle)
        else:
            missing.add(handle)

    return mentioned, missing
311  
312  
313  # ---------------------------------------------------------------------------
314  # Main
315  # ---------------------------------------------------------------------------
316  
def main():
    """Parse the CLI options, run the three scans, and print the audit report.

    The process exits with status 1 only when ``--strict`` is given and
    unmapped emails remain after bot filtering. With ``--diff-base`` only
    emails that authored commits in ``<diff-base>..HEAD`` count.
    """
    cli = argparse.ArgumentParser(
        description="Audit contributors across git history, co-author trailers, and salvaged PRs.",
    )
    cli.add_argument("--since-tag", required=True, help="Git tag to start from (e.g., v2026.4.8)")
    cli.add_argument("--until", default="HEAD", help="Git ref to end at (default: HEAD)")
    cli.add_argument(
        "--release-file",
        default=None,
        help="Path to a release notes file to check for missing contributors",
    )
    cli.add_argument(
        "--strict",
        action="store_true",
        help="Exit with code 1 if new unmapped emails are found (for CI)",
    )
    cli.add_argument(
        "--diff-base",
        default=None,
        help="Git ref to diff against (only flag emails from commits after this ref)",
    )
    args = cli.parse_args()

    print(f"=== Contributor Audit: {args.since_tag}..{args.until} ===")
    print()

    # Step 1: authors of the commits themselves.
    print("[1/3] Scanning git commit authors...")
    by_commit, unmapped_commit = collect_commit_authors(args.since_tag, args.until)
    print(f"      Found {len(by_commit)} contributor(s) from commits.")

    # Step 2: people credited through Co-authored-by trailers.
    print("[2/3] Scanning Co-authored-by trailers...")
    by_trailer, unmapped_trailer = collect_co_authors(args.since_tag, args.until)
    print(f"      Found {len(by_trailer)} contributor(s) from co-author trailers.")

    # Step 3: people credited in merged PR descriptions.
    print("[3/3] Scanning salvaged/cherry-picked PR descriptions...")
    by_salvage, salvage_pr_refs = collect_salvaged_contributors(args.since_tag, args.until)
    print(f"      Found {len(by_salvage)} contributor(s) from salvaged PRs.")

    # Combine the three scans into handle -> set of source labels.
    all_contributors = defaultdict(set)
    for found in (by_commit, by_trailer, by_salvage):
        for handle, sources in found.items():
            all_contributors[handle].update(sources)

    # Co-author entries win over commit entries for the same email.
    all_unknowns = {**unmapped_commit, **unmapped_trailer}

    # Drop AI assistants, bots, and machine accounts from both collections.
    for bot in [h for h in all_contributors if is_ignored(h)]:
        del all_contributors[bot]
    all_unknowns = {
        email: name
        for email, name in all_unknowns.items()
        if not is_ignored(name, email)
    }

    # ---- Report: everyone found ----
    print()
    print(f"=== All Contributors ({len(all_contributors)}) ===")
    print()

    for handle in sorted(all_contributors, key=str.lower):
        source_str = ", ".join(sorted(all_contributors[handle]))
        extra = ""
        if handle in salvage_pr_refs:
            pr_list = ", ".join(f"#{n}" for n in salvage_pr_refs[handle])
            extra = f"  (PRs: {pr_list})"
        print(f"  @{handle}  [{source_str}]{extra}")

    # ---- Report: emails without an AUTHOR_MAP entry ----
    if all_unknowns:
        print()
        print(f"=== Unknown Emails ({len(all_unknowns)}) ===")
        print("These emails are not in AUTHOR_MAP and should be added:")
        print()
        for email, name in sorted(all_unknowns.items()):
            print(f'  "{email}": "{name}",')

    # ---- Strict mode: the CI gate ----
    # Without --diff-base every unknown email fails the run. With it, existing
    # unknowns are grandfathered and only emails that authored commits after
    # the base ref are flagged.
    strict_failed = False
    if args.strict and all_unknowns:
        if args.diff_base:
            recent_log = git(
                "log", f"{args.diff_base}..HEAD",
                "--format=%ae", "--no-merges",
            )
            recent_emails = set(recent_log.splitlines())
            new_unknowns = {
                email: name
                for email, name in all_unknowns.items()
                if email in recent_emails
            }
        else:
            new_unknowns = all_unknowns

        if new_unknowns:
            strict_failed = True
            print()
            print(f"=== STRICT MODE FAILURE: {len(new_unknowns)} new unmapped email(s) ===")
            print("Add these to AUTHOR_MAP in scripts/release.py before merging:")
            print()
            for email in sorted(new_unknowns):
                print(f'    "{email}": "<github-username>",')
            print()
            print("To find the GitHub username:")
            print("  gh api 'search/users?q=EMAIL+in:email' --jq '.items[0].login'")

    # ---- Optional comparison against a release notes file ----
    if args.release_file:
        print()
        print(f"=== Release File Check: {args.release_file} ===")
        print()
        mentioned, missing = check_release_file(args.release_file, all_contributors.keys())
        print(f"  Mentioned in release notes: {len(mentioned)}")
        print(f"  Missing from release notes: {len(missing)}")
        print()
        if not missing:
            print("  All contributors are mentioned in the release file!")
        else:
            print("  Contributors NOT mentioned in the release file:")
            for handle in sorted(missing, key=str.lower):
                labels = ", ".join(sorted(all_contributors[handle]))
                print(f"    @{handle}  [{labels}]")

    print()
    print("Done.")

    if strict_failed:
        sys.exit(1)


if __name__ == "__main__":
    main()