#!/usr/bin/env python3
"""Contributor Audit Script

Cross-references git authors, Co-authored-by trailers, and salvaged PR
descriptions to find any contributors missing from the release notes.

Usage:
    # Basic audit since a tag
    python scripts/contributor_audit.py --since-tag v2026.4.8

    # Audit with a custom endpoint
    python scripts/contributor_audit.py --since-tag v2026.4.8 --until v2026.4.13

    # Compare against a release notes file
    python scripts/contributor_audit.py --since-tag v2026.4.8 --release-file RELEASE_v0.9.0.md
"""

import argparse
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

# ---------------------------------------------------------------------------
# Import AUTHOR_MAP and resolve_author from the sibling release.py module
# ---------------------------------------------------------------------------
SCRIPT_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(SCRIPT_DIR))

from release import AUTHOR_MAP, resolve_author  # noqa: E402

REPO_ROOT = SCRIPT_DIR.parent

# Defaults used by gh_pr_list() when the caller does not override them.
DEFAULT_GH_REPO = "NousResearch/hermes-agent"
DEFAULT_GH_PR_LIMIT = 300

# ---------------------------------------------------------------------------
# AI assistants, bots, and machine accounts to exclude from contributor lists
# ---------------------------------------------------------------------------
IGNORED_PATTERNS = [
    re.compile(r"^Claude", re.IGNORECASE),
    re.compile(r"^Copilot$", re.IGNORECASE),
    re.compile(r"^Cursor\s+Agent$", re.IGNORECASE),
    re.compile(r"^GitHub\s*Actions?$", re.IGNORECASE),
    re.compile(r"^dependabot", re.IGNORECASE),
    re.compile(r"^renovate", re.IGNORECASE),
    re.compile(r"^Hermes\s+(Agent|Audit)$", re.IGNORECASE),
    re.compile(r"^Ubuntu$", re.IGNORECASE),
]

# All entries are lowercase; is_ignored() lowercases its input before lookup.
IGNORED_EMAILS = {
    "noreply@anthropic.com",
    "noreply@github.com",
    "cursoragent@cursor.com",
    "hermes@nousresearch.com",
    "hermes-audit@example.com",
    "hermes@habibilabs.dev",
}


def is_ignored(handle: str, email: str = "") -> bool:
    """Return True if this contributor is a bot/AI/machine account.

    Args:
        handle: GitHub handle or git author name, matched against
            IGNORED_PATTERNS.
        email: Optional author email, matched case-insensitively against
            IGNORED_EMAILS.
    """
    if email and email.strip().lower() in IGNORED_EMAILS:
        return True
    return any(pattern.search(handle) for pattern in IGNORED_PATTERNS)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def git(*args, cwd=None):
    """Run a git command and return its stripped stdout.

    Args:
        *args: Arguments passed to ``git`` (no shell is involved).
        cwd: Working directory; defaults to REPO_ROOT.

    Returns:
        The command's stdout with surrounding whitespace stripped, or an
        empty string (after printing a warning to stderr) when git exits
        with a non-zero status.
    """
    result = subprocess.run(
        ["git", *args],
        capture_output=True,
        text=True,
        # Decode explicitly so author names do not depend on the locale.
        encoding="utf-8",
        errors="replace",
        cwd=cwd or str(REPO_ROOT),
    )
    if result.returncode != 0:
        print(f" [warn] git {' '.join(args)} failed: {result.stderr.strip()}", file=sys.stderr)
        return ""
    return result.stdout.strip()


def gh_pr_list(repo=DEFAULT_GH_REPO, limit=DEFAULT_GH_PR_LIMIT):
    """Fetch merged PRs from GitHub using the gh CLI.

    Args:
        repo: ``owner/name`` of the repository to query.
        limit: Maximum number of PRs to request from ``gh pr list``.

    Returns:
        A list of dicts with keys: number, title, body, author, mergedAt.
        Returns an empty list if gh is not available or the call fails.
    """
    command = [
        "gh", "pr", "list",
        "--repo", repo,
        "--state", "merged",
        "--json", "number,title,body,author,mergedAt",
        "--limit", str(limit),
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=60,
        )
    except FileNotFoundError:
        print(" [warn] 'gh' CLI not found — skipping salvaged PR scan.", file=sys.stderr)
        return []
    except subprocess.TimeoutExpired:
        print(" [warn] gh pr list timed out — skipping salvaged PR scan.", file=sys.stderr)
        return []

    if result.returncode != 0:
        print(f" [warn] gh pr list failed: {result.stderr.strip()}", file=sys.stderr)
        return []

    try:
        prs = json.loads(result.stdout)
    except json.JSONDecodeError:
        print(" [warn] gh pr list returned invalid JSON — skipping salvaged PR scan.", file=sys.stderr)
        return []

    # A full page means older PRs may have been cut off; say so rather than
    # silently auditing an incomplete list.
    if isinstance(prs, list) and len(prs) >= limit:
        print(
            f" [warn] gh pr list returned {len(prs)} PRs (the --limit); "
            "older merged PRs may be missing from the salvage scan.",
            file=sys.stderr,
        )
    return prs


def _parse_iso_datetime(value):
    """Parse an ISO 8601 timestamp into a timezone-aware datetime.

    Accepts both numeric offsets (as emitted by git's ``%aI``) and a trailing
    ``Z`` (as emitted by GitHub). Returns None for empty or unparseable input.
    Timestamps without any offset are assumed to be UTC.
    """
    if not value:
        return None
    text = value.strip()
    # datetime.fromisoformat() only accepts a literal "Z" on newer Pythons.
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _record_author(contributors, unknown_emails, name, email, source):
    """Resolve one (name, email) pair and record it under ``source``.

    resolve_author() returns "@handle" for mapped authors or a plain name
    otherwise; unresolved authors are additionally stored in
    ``unknown_emails`` (email -> git name).
    """
    handle = resolve_author(name, email)
    if handle.startswith("@"):
        contributors[handle.lstrip("@")].add(source)
    else:
        # Could not resolve — record as unknown
        contributors[handle].add(source)
        unknown_emails[email] = name


# ---------------------------------------------------------------------------
# Contributor collection
# ---------------------------------------------------------------------------

# Patterns that indicate salvaged/cherry-picked/co-authored work in PR bodies
SALVAGE_PATTERNS = [
    # "Salvaged from @username" or "Salvaged from #123"
    re.compile(r"[Ss]alvaged\s+from\s+@(\w[\w-]*)"),
    re.compile(r"[Ss]alvaged\s+from\s+#(\d+)"),
    # "Cherry-picked from @username"
    re.compile(r"[Cc]herry[- ]?picked\s+from\s+@(\w[\w-]*)"),
    # "Based on work by @username"
    re.compile(r"[Bb]ased\s+on\s+work\s+by\s+@(\w[\w-]*)"),
    # "Original PR by @username"
    re.compile(r"[Oo]riginal\s+PR\s+by\s+@(\w[\w-]*)"),
    # "Co-authored with @username"
    re.compile(r"[Cc]o[- ]?authored\s+with\s+@(\w[\w-]*)"),
]

# Pattern for Co-authored-by trailers in commit messages
CO_AUTHORED_RE = re.compile(
    r"Co-authored-by:\s*(.+?)\s*<([^>]+)>",
    re.IGNORECASE,
)


def collect_commit_authors(since_tag, until="HEAD"):
    """Collect contributors from git commit authors.

    Args:
        since_tag: Exclusive start of the commit range.
        until: Inclusive end of the commit range (default: HEAD).

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        unknown_emails: dict mapping email -> git name (for emails not in AUTHOR_MAP)
    """
    contributors = defaultdict(set)
    unknown_emails = {}

    # NUL-separated fields: unlike "|", NUL cannot occur in a name or email.
    log = git(
        "log", f"{since_tag}..{until}",
        "--format=%an%x00%ae",
        "--no-merges",
    )

    for line in log.split("\n"):
        name, separator, email = line.partition("\x00")
        if not separator:
            # Blank or malformed line (also covers empty git output).
            continue
        _record_author(contributors, unknown_emails, name, email, "commit")

    return contributors, unknown_emails


def collect_co_authors(since_tag, until="HEAD"):
    """Collect contributors from Co-authored-by trailers in commit messages.

    Args:
        since_tag: Exclusive start of the commit range.
        until: Inclusive end of the commit range (default: HEAD).

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        unknown_emails: dict mapping email -> git name
    """
    contributors = defaultdict(set)
    unknown_emails = {}

    # Only commit bodies are needed to scan for trailers.
    log = git(
        "log", f"{since_tag}..{until}",
        "--format=%b",
        "--no-merges",
    )

    # Match line by line so a trailer can never span two lines.
    for line in log.split("\n"):
        match = CO_AUTHORED_RE.search(line)
        if match is None:
            continue
        _record_author(
            contributors,
            unknown_emails,
            match.group(1).strip(),
            match.group(2).strip(),
            "co-author",
        )

    return contributors, unknown_emails


def collect_salvaged_contributors(since_tag, until="HEAD"):
    """Scan merged PR bodies for salvage/cherry-pick/co-author attribution.

    Uses the gh CLI to fetch PRs, then filters to the date range defined
    by since_tag..until and scans bodies for salvage patterns.

    The range bounds are the author dates (``%aI``) of the commits that
    ``since_tag`` and ``until`` point at. They are compared with each PR's
    ``mergedAt`` as timezone-aware datetimes, because git reports a local
    UTC offset while GitHub reports UTC.

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        pr_refs: dict mapping github_handle -> list of PR numbers where found
    """
    contributors = defaultdict(set)
    pr_refs = defaultdict(list)

    # Determine the date range from git tags/refs
    since_date = _parse_iso_datetime(git("log", "-1", "--format=%aI", since_tag))
    if since_date is None:
        print(f" [warn] Could not resolve date for {since_tag}", file=sys.stderr)
        return contributors, pr_refs

    # An unresolvable upper bound simply leaves the range open-ended.
    until_date = _parse_iso_datetime(git("log", "-1", "--format=%aI", until))

    for pr in gh_pr_list():
        # Filter by merge date if available; PRs without one are kept.
        merged_at = _parse_iso_datetime(pr.get("mergedAt") or "")
        if merged_at is not None:
            if merged_at < since_date:
                continue
            if until_date is not None and merged_at > until_date:
                continue

        body = pr.get("body") or ""
        pr_number = pr.get("number", "?")

        for pattern in SALVAGE_PATTERNS:
            for match in pattern.finditer(body):
                value = match.group(1)
                # If it's a number, it's a PR reference — skip for now
                # (would need another API call to resolve PR author)
                if value.isdigit():
                    continue
                contributors[value].add("salvage")
                # Several patterns can credit the same user in one PR body.
                if pr_number not in pr_refs[value]:
                    pr_refs[value].append(pr_number)

    return contributors, pr_refs


# ---------------------------------------------------------------------------
# Release file comparison
# ---------------------------------------------------------------------------

def _mentions_handle(content, handle):
    """Return True if ``handle`` (optionally ``@``-prefixed) appears in ``content``.

    The match is case-insensitive and must not be embedded in a longer
    identifier, so handle "al" does not match the word "also".
    """
    pattern = r"(?<![\w-])@?" + re.escape(handle) + r"(?![\w-])"
    return re.search(pattern, content, re.IGNORECASE) is not None


def check_release_file(release_file, all_contributors):
    """Check which contributors are mentioned in the release file.

    Args:
        release_file: Path to the release notes file.
        all_contributors: Iterable of handles to look for.

    Returns:
        mentioned: set of handles found in the file
        missing: set of handles NOT found in the file (every handle when
            the file cannot be read)
    """
    try:
        content = Path(release_file).read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        print(f" [error] Release file not found: {release_file}", file=sys.stderr)
        return set(), set(all_contributors)
    except OSError as exc:
        print(f" [error] Could not read release file {release_file}: {exc}", file=sys.stderr)
        return set(), set(all_contributors)

    mentioned = set()
    missing = set()
    for handle in all_contributors:
        if _mentions_handle(content, handle):
            mentioned.add(handle)
        else:
            missing.add(handle)

    return mentioned, missing


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def _build_arg_parser():
    """Build the command-line parser for the audit script."""
    parser = argparse.ArgumentParser(
        description="Audit contributors across git history, co-author trailers, and salvaged PRs.",
    )
    parser.add_argument(
        "--since-tag",
        required=True,
        help="Git tag to start from (e.g., v2026.4.8)",
    )
    parser.add_argument(
        "--until",
        default="HEAD",
        help="Git ref to end at (default: HEAD)",
    )
    parser.add_argument(
        "--release-file",
        default=None,
        help="Path to a release notes file to check for missing contributors",
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Exit with code 1 if new unmapped emails are found (for CI)",
    )
    parser.add_argument(
        "--diff-base",
        default=None,
        help="Git ref to diff against (only flag emails from commits after this ref)",
    )
    return parser


def _select_new_unknowns(all_unknowns, diff_base, until):
    """Return the unknown emails that strict mode should fail on.

    Without ``diff_base`` every unknown email counts. With it, only emails
    that authored a non-merge commit in ``diff_base..until`` count, so
    unknowns that predate the diff base are grandfathered.
    """
    if not diff_base:
        return dict(all_unknowns)
    # Only flag emails from commits after diff_base
    output = git(
        "log", f"{diff_base}..{until}",
        "--format=%ae", "--no-merges",
    )
    new_emails = set(output.splitlines())
    return {email: name for email, name in all_unknowns.items() if email in new_emails}


def main():
    """Run the audit, print the report, and exit 1 on a strict-mode failure."""
    args = _build_arg_parser().parse_args()

    print(f"=== Contributor Audit: {args.since_tag}..{args.until} ===")
    print()

    # ---- 1. Git commit authors ----
    print("[1/3] Scanning git commit authors...")
    commit_contribs, commit_unknowns = collect_commit_authors(args.since_tag, args.until)
    print(f" Found {len(commit_contribs)} contributor(s) from commits.")

    # ---- 2. Co-authored-by trailers ----
    print("[2/3] Scanning Co-authored-by trailers...")
    coauthor_contribs, coauthor_unknowns = collect_co_authors(args.since_tag, args.until)
    print(f" Found {len(coauthor_contribs)} contributor(s) from co-author trailers.")

    # ---- 3. Salvaged PRs ----
    print("[3/3] Scanning salvaged/cherry-picked PR descriptions...")
    salvage_contribs, salvage_pr_refs = collect_salvaged_contributors(args.since_tag, args.until)
    print(f" Found {len(salvage_contribs)} contributor(s) from salvaged PRs.")

    # ---- Merge all contributors ----
    all_contributors = defaultdict(set)
    for source_map in (commit_contribs, coauthor_contribs, salvage_contribs):
        for handle, sources in source_map.items():
            all_contributors[handle].update(sources)

    # Merge unknown emails (co-author entries win on duplicate emails)
    all_unknowns = {**commit_unknowns, **coauthor_unknowns}

    # Filter out AI assistants, bots, and machine accounts
    for handle in [h for h in all_contributors if is_ignored(h)]:
        del all_contributors[handle]
    # Also filter unknowns by email
    all_unknowns = {e: n for e, n in all_unknowns.items() if not is_ignored(n, e)}

    # ---- Output ----
    print()
    print(f"=== All Contributors ({len(all_contributors)}) ===")
    print()

    # Sort by handle, case-insensitive
    for handle in sorted(all_contributors, key=str.lower):
        source_str = ", ".join(sorted(all_contributors[handle]))
        extra = ""
        if handle in salvage_pr_refs:
            pr_list = ", ".join(f"#{n}" for n in salvage_pr_refs[handle])
            extra = f" (PRs: {pr_list})"
        print(f" @{handle} [{source_str}]{extra}")

    # ---- Unknown emails ----
    if all_unknowns:
        print()
        print(f"=== Unknown Emails ({len(all_unknowns)}) ===")
        print("These emails are not in AUTHOR_MAP and should be added:")
        print()
        for email, name in sorted(all_unknowns.items()):
            print(f' "{email}": "{name}",')

    # ---- Strict mode: fail CI if new unmapped emails are introduced ----
    # This is the CI gate: existing unknowns are grandfathered (when
    # --diff-base is given), but new commits must have their author email
    # in AUTHOR_MAP.
    strict_failed = False
    if args.strict and all_unknowns:
        new_unknowns = _select_new_unknowns(all_unknowns, args.diff_base, args.until)
        if new_unknowns:
            print()
            print(f"=== STRICT MODE FAILURE: {len(new_unknowns)} new unmapped email(s) ===")
            print("Add these to AUTHOR_MAP in scripts/release.py before merging:")
            print()
            for email in sorted(new_unknowns):
                print(f' "{email}": "<github-username>",')
            print()
            print("To find the GitHub username:")
            print(" gh api 'search/users?q=EMAIL+in:email' --jq '.items[0].login'")
            strict_failed = True

    # ---- Release file comparison ----
    if args.release_file:
        print()
        print(f"=== Release File Check: {args.release_file} ===")
        print()
        mentioned, missing = check_release_file(args.release_file, all_contributors)
        print(f" Mentioned in release notes: {len(mentioned)}")
        print(f" Missing from release notes: {len(missing)}")
        print()
        if missing:
            print(" Contributors NOT mentioned in the release file:")
            for handle in sorted(missing, key=str.lower):
                sources = sorted(all_contributors[handle])
                print(f" @{handle} [{', '.join(sources)}]")
        else:
            print(" All contributors are mentioned in the release file!")

    print()
    print("Done.")

    if strict_failed:
        sys.exit(1)


if __name__ == "__main__":
    main()