/ list-pulls.py
list-pulls.py
#!/usr/bin/env python3
'''
Script to parse git commit list, extract github issues to create a changelog in
text and JSON format.

Run this in the root directory of the repository.

This requires an up-to-date checkout of https://github.com/zw/bitcoin-gh-meta.git
in the parent directory, or environment variable `GHMETA`.

It takes a range of commits and a .json file of PRs to exclude, for
example if these are already backported in a minor release. This can be the pulls.json
generated from a previous release.

Example usage:

    ../maintainer-tools/list-pulls.py v28.0 29 relnot/pulls-exclude.json > relnot/pulls.md

The markdown draft is printed to standard output; the complete data set
(commits, pulls, orphans) is additionally written to `pulls.json` in the
current directory.

The output of this script is a first draft based on rough heuristics, and
likely needs to be extensively manually edited before ending up in the release
notes.
'''
# W.J. van der Laan 2017-2021
# SPDX-License-Identifier: MIT
import subprocess
import re
import json
import time
import sys
import os
from collections import namedtuple, defaultdict

# == Global environment ==
# Each setting can be overridden through the environment variable of the same name.
GIT = os.getenv('GIT', 'git')
GHMETA = os.getenv('GHMETA', '../bitcoin-gh-meta')
DEFAULT_REPO = os.getenv('DEFAULT_REPO', 'bitcoin/bitcoin')

# == Label to category mapping ==
# See: https://github.com/bitcoin/bitcoin/labels
# this is priority ordering: the first label to be matched determines the
# category it is slotted to
# TODO: simply create titles for combinations of mappings, and leave it up to release note writer
#       which one to choose? this automatic choosing based on "priority" kind of sucks.
# Each entry is (set of lower-case github labels, release-note category).
# Entries are tried from top to bottom, so earlier entries win.
LABEL_MAPPING = (
    # Consensus, mining and policy changes should come first
    ({'consensus'},
     'Consensus'),
    ({'tx fees and policy'},
     'Policy'),
    ({'mining'},
     'Mining'),
    # Privacy changes
    ({'privacy'},
     'Privacy'),
    # Backends
    ({'mempool', 'block storage', 'utxo db and indexes', 'validation'},
     'Block and transaction handling'),
    ({'p2p'},
     'P2P protocol and network code'),
    ({'wallet', 'descriptors'},
     'Wallet'),
    # Frontends
    ({'rpc/rest/zmq'},
     'RPC and other APIs'),
    ({'gui'},
     'GUI'),
    # Frameworks, infrastructure, building etcetera
    ({'build system'},
     'Build system'),
    ({'tests'},
     'Tests and QA'),
    ({'utils/log/libs', 'scripts and tools', 'upstream', 'utils and libraries'},
     'Miscellaneous'),
    # Documentation-only
    ({'docs and output', 'docs'},
     'Documentation'),
    ({'windows', 'unix', 'macos'},
     'Platform support'),
    # Ignore everything below this for pull list
    ({'refactoring'},
     'Refactoring'), # Ignore pure refactoring for pull list
    ({'backport'},
     'Backports'), # Ignore pure backports for pull list
)
# Category used when no label matches.
UNCATEGORIZED = 'Uncategorized'

# == PR title prefix to category mapping ==
# this takes precedence over the above label mapping
# handle (in all cases, ignoring leading and trailing ' ')
# SPECIFY IN LOWERCASE
# set do_strip as False if the prefix adds information beyond what the category provides!
# Recognized title forms for each prefix: '[prefix]:' '[prefix]' 'prefix:'
PREFIXES = [
    # (prefix, category, do_strip)
    ('bench', 'Tests and QA', False),
    ('build', 'Build system', True),
    ('ci', 'Tests and QA', False),
    ('cli', 'RPC and other APIs', False),
    ('consensus', 'Consensus', True),
    ('contrib', 'Miscellaneous', False),
    ('depends', 'Build system', True),
    ('doc', 'Documentation', True),
    ('docs', 'Documentation', True),
    ('gitian', 'Build system', False),
    ('gui', 'GUI', True),
    ('lint', 'Miscellaneous', False),
    ('logging', 'Miscellaneous', False),
    ('mempool', 'Block and transaction handling', True),
    ('txmempool', 'Block and transaction handling', True),
    ('moveonly', 'Refactoring', False),
    ('net', 'P2P protocol and network code', True),
    ('nit', 'Refactoring', True),
    ('p2p', 'P2P protocol and network code', True),
    ('policy', 'Policy', True),
    ('qa', 'Tests and QA', True),
    ('qt', 'GUI', True),
    ('refactor', 'Refactoring', True),
    ('release', 'Build system', False),
    ('rest', 'RPC and other APIs', False),
    ('rpc', 'RPC and other APIs', True),
    ('scripted-diff', 'Refactoring', False),
    ('script', 'Miscellaneous', False), # !!! this is unclear, 'script' could also be block/tx handling or even consensus
    ('scripts', 'Miscellaneous', False),
    ('shutdown', 'Miscellaneous', False),
    ('tests', 'Tests and QA', True),
    ('test', 'Tests and QA', True),
    ('travis', 'Tests and QA', False),
    ('trivial', 'Refactoring', True),
    ('ui', 'GUI', True),
    ('util', 'Miscellaneous', False),
    ('utils', 'Miscellaneous', False),
    ('validation', 'Block and transaction handling', True),
    ('wallet', 'Wallet', True),
]

# Per-repository information
REPO_INFO = {
    'bitcoin/bitcoin': {
        'label_mapping': LABEL_MAPPING,
        'prefixes': PREFIXES,
        'default_category': UNCATEGORIZED,
        'ghmeta': GHMETA,
    },
    # For now, GUI repository pulls are automatically categorized into the GUI category.
    'bitcoin-core/gui': {
        'label_mapping': (),
        'prefixes': [],
        'default_category': 'GUI',
        'ghmeta': None,
    },
}

# == Utilities ==

def remove_last_if_empty(l):
    '''
    Return `l` without its last member if that member is an empty str or bytes.
    Any other list, including an empty one, is returned unchanged.
    '''
    if l and (l[-1]==b'' or l[-1]==''):
        return l[0:-1]
    return l

# Valid chars in github names
# (raw strings: '\-' is not a valid escape sequence in a normal string literal)
VALIDNAMECHARS = r'[0-9a-zA-Z\-_]'
# For parsing owner/repo#id
FQID_RE = re.compile(r'^(' + VALIDNAMECHARS + r'+)/(' + VALIDNAMECHARS + r'+)#([0-9]+)$')
# For parsing non-qualified #id
PR_RE = re.compile(r'^#?([0-9]+)$')

class FQId:
    '''
    Fully qualified PR id.

    Instances compare, sort and hash by the tuple (owner, repo, pr), so they
    can be used as dictionary keys and set members.
    '''
    def __init__(self, owner: str, repo: str, pr: int):
        self.owner = owner
        self.repo = repo
        self.pr = pr

    @property
    def _key(self):
        return (self.owner, self.repo, self.pr)

    def __eq__(self, o):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError.
        if not isinstance(o, FQId):
            return NotImplemented
        return self._key == o._key

    def __lt__(self, o):
        if not isinstance(o, FQId):
            return NotImplemented
        return self._key < o._key

    def __hash__(self):
        return hash(self._key)

    def __str__(self):
        return f'{self.owner}/{self.repo}#{self.pr}'

    def __repr__(self):
        return f'FQId({repr(self.owner)}, {repr(self.repo)}, {repr(self.pr)})'

    @classmethod
    def parse(cls, pull, default_repo):
        '''
        Return FQId from 'owner/repo#id' or '#id' or 'id' string.

        `default_repo` ('owner/repo') supplies owner and repository for the
        non-qualified forms. Raises ValueError if `pull` matches neither form.
        '''
        m = FQID_RE.match(pull)
        if m:
            return cls(m.group(1), m.group(2), int(m.group(3)))
        m = PR_RE.match(pull)
        if m:
            (owner, repo) = default_repo.split('/')
            return cls(owner, repo, int(m.group(1)))
        raise ValueError(f'Cannot parse {pull} as PR specification.')

def tests():
    '''Quick internal sanity tests.'''
    assert FQId.parse('bitcoin/bitcoin#1234', 'bitcoin/bitcoin') == FQId('bitcoin', 'bitcoin', 1234)
    assert FQId.parse('bitcoin-core/gui#1235', 'bitcoin/bitcoin') == FQId('bitcoin-core', 'gui', 1235)
    assert FQId.parse('#1236', 'bitcoin/bitcoin') == FQId('bitcoin', 'bitcoin', 1236)
    assert FQId.parse('1237', 'bitcoin/bitcoin') == FQId('bitcoin', 'bitcoin', 1237)
    assert str(FQId('bitcoin', 'bitcoin', 1239)) == 'bitcoin/bitcoin#1239'
    assert FQId('bitcoin', 'bitcoin', 1239) < FQId('bitcoin', 'bitcoin', 1240)
    assert not (FQId('bitcoin', 'bitcoin', 1240) < FQId('bitcoin', 'bitcoin', 1239))
    assert FQId('bitcoin', 'bitcoin', 1240) < FQId('bitcoin-core', 'gui', 1239)
    assert not (FQId('bitcoin-core', 'gui', 1239) < FQId('bitcoin', 'bitcoin', 1240))
    assert FQId('bitcoin', 'bitcoin', 1240) != 'bitcoin/bitcoin#1240'
    assert remove_last_if_empty([]) == []
    assert remove_last_if_empty(['a', '']) == ['a']
    assert remove_last_if_empty(['a', 'b']) == ['a', 'b']

# == Main program ==
tests()
if len(sys.argv) < 3:
    print(f'Usage: {sys.argv[0]} <ref_from> <ref_to> [<pulls-exclude.json>]', file=sys.stderr)
    sys.exit(1)
ref_from = sys.argv[1] # 'v29.1rc1'
ref_to = sys.argv[2] # 'master'

# read exclude file
exclude_pulls = set()

if len(sys.argv) >= 4:
    exclude_file = sys.argv[3]
    try:
        with open(exclude_file, 'r') as f:
            d = json.load(f)
    except IOError:
        print(f'Unable to read exclude file {exclude_file}', file=sys.stderr)
        sys.exit(1)
    exclude_pulls = set(FQId.parse(str(p['id']), DEFAULT_REPO) for p in d['pulls'])
    print(f'Excluding {", ".join(str(p) for p in sorted(exclude_pulls))}')
    print()

# all commits in the range, oldest first
commits = subprocess.check_output([GIT, 'rev-list', '--reverse', '--topo-order', ref_from+'..'+ref_to])
commits = commits.decode()
commits = remove_last_if_empty(commits.splitlines())
commits_list = commits
# set of all commits
commits = set(commits)
# position of every commit in commits_list (avoids repeated linear list.index() scans)
commit_index = {sha: i for (i, sha) in enumerate(commits_list)}

CommitData = namedtuple('CommitData', ['sha', 'message', 'title', 'parents'])
commit_data = {}

# collect data
# (walk the list rather than the set so that commit_data, and everything
# derived from it, has a reproducible order)
for commit in commits_list:
    info = subprocess.check_output([GIT, 'show', '-s', '--format=%B%x00%P', commit])
    info = info.decode()
    (message, parents) = info.split('\0')
    message_lines = message.rstrip().splitlines()
    title = message_lines[0] if message_lines else '' # commit messages can be empty
    parents = parents.rstrip().split(' ')
    commit_data[commit] = CommitData(commit, message, title, parents)

class CommitMetaData:
    '''Backport information extracted from a commit message.'''
    pull = None # FQId of the original pull, from the 'Github-Pull:' line
    rebased_from = None # list of words following 'Rebased-From:'

    def __repr__(self):
        return 'CommitMetadata(pull=%s,rebased_from=%s)' % (self.pull,self.rebased_from)

def parse_commit_message(msg):
    '''
    Parse backport commit message.

    Looks for 'Github-Pull:' and 'Rebased-From:' lines. Returns a
    CommitMetaData if a 'Github-Pull:' line was found, otherwise None.
    Raises ValueError if the pull specification cannot be parsed.
    '''
    pull_tag = 'Github-Pull:'
    rebased_tag = 'Rebased-From:'
    retval = CommitMetaData()
    for line in msg.splitlines():
        if line.startswith(pull_tag):
            param = line[len(pull_tag):].strip()
            if param.startswith('#'): # compensate for incorrect #bitcoin-core/gui#148
                param = param[1:]
            retval.pull = FQId.parse(param, DEFAULT_REPO)
        if line.startswith(rebased_tag):
            retval.rebased_from = line[len(rebased_tag):].strip().split()
    if retval.pull is None:
        return None
    return retval

# traverse merge commits
pulls = {}
PullData = namedtuple('PullData', ['id', 'merge', 'commits', 'index'])
orphans = set(commits)
MERGE_RE = re.compile(r'Merge (.*?):')
for c in commit_data.values():
    # only merge commits are of interest here
    if len(c.parents) <= 1:
        continue
    assert(len(c.parents)==2)
    match = MERGE_RE.match(c.title)
    if not match:
        print(f'{c.sha}: Merge commit does not merge a PR: {c.title}')
        continue

    # merges a pull request
    orphans.discard(c.sha)
    sub_commits = subprocess.check_output([GIT, 'rev-list', c.parents[0]+'..'+c.parents[1]])
    sub_commits = sub_commits.decode()
    sub_commits = set(sub_commits.rstrip().splitlines())
    pull = FQId.parse(match.group(1), DEFAULT_REPO)

    # remove commits that are not in the global list
    sub_commits = sub_commits.intersection(commits)
    orphans -= sub_commits

    # only report pull if it is not excluded and any new commit went into the release
    if pull in exclude_pulls or not sub_commits:
        continue
    index = commit_index[c.sha]
    pulls[pull] = PullData(pull, c.sha, sub_commits, index)

    # look up commits and see if they point to master pulls
    # (=backport pull)
    # add those too
    # (visited oldest first, so each sub-pull lists its commits in history order)
    sub_pulls = defaultdict(list)
    for cid in sorted(sub_commits, key=commit_index.__getitem__):
        md = parse_commit_message(commit_data[cid].message)
        if md is not None:
            sub_pulls[md.pull].append(cid)

    if not sub_pulls and 'backport' in c.title.lower():
        # just information for manual checking
        print(f'{pull}: Merge PR title {repr(c.title)} contains \'backport\' but there are no sub-pulls')

    for (sub_pull, sub_pull_commits) in sub_pulls.items():
        pulls[sub_pull] = PullData(sub_pull, sub_pull_commits[0], sub_pull_commits, index)

# Extract remaining pull numbers from orphans, if they're backports
# (iterate over a snapshot, as orphans is modified inside the loop)
for o in [sha for sha in commits_list if sha in orphans]:
    c = commit_data[o]
    md = parse_commit_message(c.message)
    if md is not None:
        pulls[md.pull] = PullData(md.pull, c.sha, [], commit_index[c.sha])
        orphans.remove(o)

# Sort by index in commits list
# This results in approximately chronological order
pulls_order = [p.id for p in sorted(pulls.values(), key=lambda p: p.index)]

def guess_category_from_labels(repo_info, labels):
    '''
    Guess category for a PR from github labels.

    The first entry of repo_info['label_mapping'] containing any of the
    (case-insensitively compared) labels determines the category; without a
    match repo_info['default_category'] is returned.
    '''
    labels = [l.lower() for l in labels]
    for (label_list, category) in repo_info['label_mapping']:
        if any(l in label_list for l in labels):
            return category
    return repo_info['default_category']

def get_category(repo_info, labels, message):
    '''
    Guess category for a PR from repository, labels and message prefixes.
    Strip category from message.

    A recognized title prefix takes precedence over the labels. Returns the
    tuple (category, message) where message is the cleaned-up title.
    '''
    category = guess_category_from_labels(repo_info, labels)
    message = message.strip()

    for (prefix, p_category, do_strip) in repo_info['prefixes']:
        for variant in [('[' + prefix + ']:'), ('[' + prefix + ']'), (prefix + ':')]:
            if message.lower().startswith(variant):
                category = p_category
                message = message[len(variant):].lstrip()
                if not do_strip: # if strip is not requested, re-add prefix in sanitized way
                    # Upper-case the first character only: str.capitalize() would
                    # also lower-case the rest of the title, mangling identifiers
                    # and acronyms.
                    message = prefix + ': ' + message[:1].upper() + message[1:]

    return (category, message)

pull_meta = {}
pull_labels = {}
per_category = defaultdict(list)
for pull in pulls_order:
    repo_info = REPO_INFO[f'{pull.owner}/{pull.repo}']

    # Find github metadata for PR, if available
    data0 = None
    data1 = {'title': '{Not found}', 'user': {'login':'unknown'}}
    if repo_info['ghmeta'] is not None:
        # Missing metadata files are expected; the placeholders above are kept then.
        filename = f'{repo_info["ghmeta"]}/issues/{pull.pr//100}xx/{pull.pr}.json'
        try:
            with open(filename, 'r') as f:
                data0 = json.load(f)
        except IOError:
            pass

        filename = f'{repo_info["ghmeta"]}/issues/{pull.pr//100}xx/{pull.pr}-PR.json'
        try:
            with open(filename, 'r') as f:
                data1 = json.load(f)
        except IOError:
            pass

    message = data1['title']
    author = data1['user']['login']
    if data0 is not None:
        labels = [l['name'] for l in data0['labels']]
    else:
        labels = ['Missing']

    # nightmarish UTF tweaking to fix broken output of export script
    message = message.encode('ISO-8859-1', errors='replace').decode(errors='replace')

    # consistent ellipsis
    message = message.replace('...', '…')
    # no '.' at end
    if message.endswith('.'):
        message = message[0:-1]

    # determine category and new message from message
    category, message = get_category(repo_info, labels, message)
    data1['title'] = message

    per_category[category].append((pull, message, author))
    pull_labels[pull] = labels
    pull_meta[pull] = data1

# print the draft, one section per category, in LABEL_MAPPING order
for _,category in LABEL_MAPPING:
    if not per_category[category]:
        continue
    print('### %s' % category)
    for dd in per_category[category]:
        print(f'- {dd[0]} {dd[1]} ({dd[2]})')
    print()

if per_category[UNCATEGORIZED]:
    print('### %s' % UNCATEGORIZED)
    for dd in per_category[UNCATEGORIZED]:
        print(f'- {dd[0]} {dd[1]} ({dd[2]}) (labels: {pull_labels[dd[0]]})')
    print()

# orphans in history order, so that repeated runs give identical output
orphans_list = [sha for sha in commits_list if sha in orphans]

print('### Orphan commits')
for o in orphans_list:
    c = commit_data[o]
    print('- `%s` %s' % (o[0:7], c.title))

# write to json structure for postprocessing
commits_d = [commit_data[c] for c in commits_list]

pulls_d = []
for pull in sorted(pulls.keys()):
    pd = pulls[pull]
    pulls_d.append(
        {'id': str(pd.id),
         'merge': pd.merge,
         # pd.commits may be a set: order by history for reproducible output
         'commits': sorted(pd.commits, key=commit_index.__getitem__),
         'meta': pull_meta[pd.id]})

data_out = {
    'commits': commits_d,
    'pulls': pulls_d,
    'orphans': orphans_list,
}

with open('pulls.json','w') as f:
    json.dump(data_out, f, sort_keys=True,
        indent=4, separators=(',', ': '))