/ list-pulls.py
list-pulls.py
  1  #!/usr/bin/env python3
  2  '''
  3  Script to parse git commit list, extract github issues to create a changelog in
  4  text and JSON format.
  5  
  6  Run this in the root directory of the repository.
  7  
  8  This requires an up-to-date checkout of https://github.com/zw/bitcoin-gh-meta.git
  9  in the parent directory, or environment variable `GHMETA`.
 10  
 11  It takes a range of commits and a .json file of PRs to exclude, for
 12  example if these are already backported in a minor release. This can be the pulls.json
 13  generated from a previous release.
 14  
 15  Example usage:
 16  
 17      ../maintainer-tools/list-pulls.py v28.0 29 relnot/pulls-exclude.json > relnot/pulls.md
 18  
 19  The output of this script is a first draft based on rough heuristics, and
 20  likely needs to be extensively manually edited before ending up in the release
 21  notes.
 22  '''
 23  # W.J. van der Laan 2017-2021
 24  # SPDX-License-Identifier: MIT
 25  import subprocess
 26  import re
 27  import json
 28  import time
 29  import sys, os
 30  from collections import namedtuple, defaultdict
 31  
# == Global environment ==
# Each setting is read from the environment variable of the same name; the
# second argument to os.getenv is the fallback used when it is not set.

# git executable used for all subprocess calls in this script.
GIT = os.getenv('GIT', 'git')
# Path to the bitcoin-gh-meta checkout from which issue/PR JSON files are read.
GHMETA = os.getenv('GHMETA', '../bitcoin-gh-meta')
# 'owner/repo' assumed for PR references that are not fully qualified.
DEFAULT_REPO = os.getenv('DEFAULT_REPO', 'bitcoin/bitcoin')
 36  
# == Label to category mapping ==
# See: https://github.com/bitcoin/bitcoin/labels
# Each entry is (set of label names, category title). Label names must be
# lowercase, because PR labels are lowercased before they are compared.
# This is a priority ordering: entries are tried from top to bottom and the
# first entry that contains one of the PR's labels determines the category
# the PR is slotted into.
# TODO: simply create titles for combinations of mappings, and leave it up to release note writer
# which one to choose? this automatic choosing based on "priority" kind of sucks.
LABEL_MAPPING = (
    # Consensus, mining and policy changes should come first
    ({'consensus'},
        'Consensus'),
    ({'tx fees and policy'},
        'Policy'),
    ({'mining'},
        'Mining'),
    # Privacy changes
    ({'privacy'},
        'Privacy'),
    # Backends
    ({'mempool', 'block storage', 'utxo db and indexes', 'validation'},
        'Block and transaction handling'),
    ({'p2p'},
        'P2P protocol and network code'),
    ({'wallet', 'descriptors'},
        'Wallet'),
    # Frontends
    ({'rpc/rest/zmq'},
        'RPC and other APIs'),
    ({'gui'},
        'GUI'),
    # Frameworks, infrastructure, building etcetera
    ({'build system'},
        'Build system'),
    ({'tests'},
        'Tests and QA'),
    ({'utils/log/libs', 'scripts and tools', 'upstream', 'utils and libraries'},
        'Miscellaneous'),
    # Documentation-only
    ({'docs and output', 'docs'},
        'Documentation'),
    ({'windows', 'unix', 'macos'},
        'Platform support'),
    # Ignore everything below this for pull list
    # NOTE(review): the report loop iterates over all of LABEL_MAPPING, so these
    # two categories are still printed; presumably they are meant to be removed
    # by hand from the draft -- confirm.
    ({'refactoring'},
        'Refactoring'),  # Ignore pure refactoring for pull list
    ({'backport'},
        'Backports'),  # Ignore pure backports for pull list
)
# Category title used as the bitcoin/bitcoin default, i.e. when no label matches.
UNCATEGORIZED = 'Uncategorized'
 85  
# == PR title prefix to category mapping ==
# A matching title prefix takes precedence over the label mapping above.
# Each prefix is matched case-insensitively at the start of the PR title
# (after stripping leading and trailing whitespace) in three spellings:
#   '[prefix]:'  '[prefix]'  'prefix:'
# SPECIFY PREFIXES IN LOWERCASE.
# Set do_strip to False if the prefix adds information beyond what the
# category provides; the prefix is then kept in the title as 'prefix: '.
PREFIXES = [
    # (prefix, category, do_strip)
    ('bench', 'Tests and QA', False),
    ('build', 'Build system', True),
    ('ci', 'Tests and QA', False),
    ('cli', 'RPC and other APIs', False),
    ('consensus', 'Consensus', True),
    ('contrib', 'Miscellaneous', False),
    ('depends', 'Build system', True),
    ('doc', 'Documentation', True),
    ('docs', 'Documentation', True),
    ('gitian', 'Build system', False),
    ('gui', 'GUI', True),
    ('lint', 'Miscellaneous', False),
    ('logging', 'Miscellaneous', False),
    ('mempool', 'Block and transaction handling', True),
    ('txmempool', 'Block and transaction handling', True),
    ('moveonly', 'Refactoring', False),
    ('net', 'P2P protocol and network code', True),
    ('nit', 'Refactoring', True),
    ('p2p', 'P2P protocol and network code', True),
    ('policy', 'Policy', True),
    ('qa', 'Tests and QA', True),
    ('qt', 'GUI', True),
    ('refactor', 'Refactoring', True),
    ('release', 'Build system', False),
    ('rest', 'RPC and other APIs', False),
    ('rpc', 'RPC and other APIs', True),
    ('scripted-diff', 'Refactoring', False),
    ('script', 'Miscellaneous', False), # !!! this is unclear, 'script' could also be block/tx handling or even consensus
    ('scripts', 'Miscellaneous', False),
    ('shutdown', 'Miscellaneous', False),
    ('tests', 'Tests and QA', True),
    ('test', 'Tests and QA', True),
    ('travis', 'Tests and QA', False),
    ('trivial', 'Refactoring', True),
    ('ui', 'GUI', True),
    ('util', 'Miscellaneous', False),
    ('utils', 'Miscellaneous', False),
    ('validation', 'Block and transaction handling', True),
    ('wallet', 'Wallet', True),
]
134  
# Per-repository information, keyed by 'owner/repo'.
# Keys of each entry:
#   label_mapping    - (label set, category) entries used to categorize by label
#   prefixes         - (prefix, category, do_strip) entries used to categorize by title
#   default_category - category used when no label matches
#   ghmeta           - directory holding the exported issue/PR JSON files, or
#                      None if no metadata is available for the repository
REPO_INFO = {
    'bitcoin/bitcoin': {
        'label_mapping': LABEL_MAPPING,
        'prefixes': PREFIXES,
        'default_category': UNCATEGORIZED,
        'ghmeta': GHMETA,
    },
    # For now, GUI repository pulls are automatically categorized into the GUI category.
    'bitcoin-core/gui': {
        'label_mapping': (),
        'prefixes': [],
        'default_category': 'GUI',
        'ghmeta': None,
    },
}
151  
152  # == Utilities ==
153  
def remove_last_if_empty(l):
    '''Return `l` without its last member if that member is empty.

    A member counts as empty when it equals '' or b''. An empty list is
    returned unchanged instead of raising IndexError, which matters when
    the command whose output was split produced no lines at all.
    The input list is never modified; a slice (copy) is returned when the
    last member is dropped.
    '''
    if l and (l[-1] == b'' or l[-1] == ''):
        return l[:-1]
    return l
160  
# Valid chars in github names.
# Raw string: '\-' in a normal string literal is an invalid escape sequence,
# which newer Python versions warn about at compile time.
VALIDNAMECHARS = r'[0-9a-zA-Z\-_]'
# For parsing owner/repo#id (groups: owner, repo, id)
FQID_RE = re.compile('^(' + VALIDNAMECHARS + '+)/(' + VALIDNAMECHARS + '+)#([0-9]+)$')
# For parsing non-qualified #id, with or without the leading '#' (group: id)
PR_RE = re.compile('^#?([0-9]+)$')
167  
class FQId:
    '''Fully qualified PR id: repository owner, repository name and PR number.

    Instances compare, order and hash by the (owner, repo, pr) tuple, so they
    can be used as dictionary keys, set members and sort keys.
    '''
    def __init__(self, owner: str, repo: str, pr: int):
        self.owner = owner
        self.repo = repo
        self.pr = pr

    @property
    def _key(self):
        '''Tuple that defines equality, ordering and hashing.'''
        return (self.owner, self.repo, self.pr)

    def __eq__(self, o):
        # Returning NotImplemented lets Python fall back to its default
        # handling (unequal) instead of failing with AttributeError when
        # compared against an unrelated object.
        if not isinstance(o, FQId):
            return NotImplemented
        return self._key == o._key

    def __lt__(self, o):
        # Ordering against unrelated types is undefined; NotImplemented makes
        # Python raise the usual TypeError for that case.
        if not isinstance(o, FQId):
            return NotImplemented
        return self._key < o._key

    def __hash__(self):
        return hash(self._key)

    def __str__(self):
        return f'{self.owner}/{self.repo}#{self.pr}'

    def __repr__(self):
        return f'FQId({self.owner!r}, {self.repo!r}, {self.pr!r})'

    @classmethod
    def parse(cls, pull, default_repo):
        '''Return FQId from 'owner/repo#id' or '#id' or 'id' string.

        `default_repo` is an 'owner/repo' string that supplies the owner and
        repository for the two non-qualified forms.
        Raises ValueError if `pull` matches none of the accepted forms.
        '''
        m = FQID_RE.match(pull)
        if m:
            return cls(m.group(1), m.group(2), int(m.group(3)))
        m = PR_RE.match(pull)
        if m:
            (owner, repo) = default_repo.split('/')
            return cls(owner, repo, int(m.group(1)))
        raise ValueError(f'Cannot parse {pull} as PR specification.')
205  
def tests():
    '''Quick internal sanity tests for FQId parsing, formatting and ordering.'''
    core = 'bitcoin/bitcoin'

    # Parsing: fully qualified, '#'-prefixed and bare numeric forms.
    expected_parse = (
        ('bitcoin/bitcoin#1234', FQId('bitcoin', 'bitcoin', 1234)),
        ('bitcoin-core/gui#1235', FQId('bitcoin-core', 'gui', 1235)),
        ('#1236', FQId('bitcoin', 'bitcoin', 1236)),
        ('1237', FQId('bitcoin', 'bitcoin', 1237)),
    )
    for text, want in expected_parse:
        assert FQId.parse(text, core) == want

    # String form.
    assert str(FQId('bitcoin', 'bitcoin', 1239)) == 'bitcoin/bitcoin#1239'

    # Ordering: by PR number within a repository, by owner/repo across repositories.
    low = FQId('bitcoin', 'bitcoin', 1239)
    high = FQId('bitcoin', 'bitcoin', 1240)
    gui = FQId('bitcoin-core', 'gui', 1239)
    assert low < high
    assert not high < low
    assert high < gui
    assert not gui < high
217  
# == Main program ==
tests()

# Both refs are required; fail with a usage message rather than an IndexError.
if len(sys.argv) < 3:
    print(f'Usage: {sys.argv[0]} <ref_from> <ref_to> [<pulls-exclude.json>]', file=sys.stderr)
    sys.exit(1)
ref_from = sys.argv[1] # 'v29.1rc1'
ref_to = sys.argv[2] # 'master'

# read exclude file
# Set of FQId that must not be reported; stays empty without a third argument.
exclude_pulls = set()

if len(sys.argv) >= 4:
    exclude_file = sys.argv[3]
    # Only opening/reading the file is guarded, so that an unrelated I/O error
    # (for example on stdout) is not reported as a problem with the exclude file.
    try:
        with open(exclude_file, 'r') as f:
            d = json.load(f)
    except IOError:
        print(f'Unable to read exclude file {exclude_file}', file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist.
        sys.exit(1)
    # The file has the layout of pulls.json: {'pulls': [{'id': ...}, ...]}
    exclude_pulls = set(FQId.parse(str(p['id']), DEFAULT_REPO) for p in d['pulls'])
    print(f'Excluding {", ".join(str(p) for p in exclude_pulls)}')
    print()
237  
# set of all commits
# Ask git for the commits in the range ref_from..ref_to, listed in reverse
# topological order (--reverse --topo-order), one hash per line.
commits = subprocess.check_output([GIT, 'rev-list', '--reverse', '--topo-order', ref_from+'..'+ref_to])
commits = commits.decode()
commits = remove_last_if_empty(commits.splitlines())
# Keep the ordered list (used later to look up a commit's position) next to
# a set used for membership tests and intersections.
commits_list = commits
commits = set(commits)

# sha: commit hash; message: full commit message; title: first line of the
# message; parents: list of parent commit hashes
CommitData = namedtuple('CommitData', ['sha', 'message', 'title', 'parents'])
# Maps commit hash -> CommitData
commit_data = {}

# collect data
for commit in commits:
    # Format: raw message body (%B), a NUL byte (%x00), then the parent hashes (%P)
    info = subprocess.check_output([GIT, 'show', '-s', '--format=%B%x00%P', commit])
    info = info.decode()
    # The NUL byte separates the message from the parent list
    (message, parents) = info.split('\0')
    title = message.rstrip().splitlines()[0]
    parents = parents.rstrip().split(' ')
    commit_data[commit] = CommitData(commit, message, title, parents)
256  
class CommitMetaData:
    '''Backport information extracted from a commit message.'''
    # PR the commit was taken from, set from a 'Github-Pull:' line
    pull = None
    # Whitespace-separated values of a 'Rebased-From:' line, as a list
    rebased_from = None

    def __repr__(self):
        return f'CommitMetadata(pull={self.pull},rebased_from={self.rebased_from})'
263  
def parse_commit_message(msg):
    '''
    Extract backport information from a commit message.

    Looks for lines starting with 'Github-Pull:' and 'Rebased-From:'.
    Returns a CommitMetaData if a 'Github-Pull:' line was found, None otherwise.
    '''
    pull_tag = 'Github-Pull:'
    rebase_tag = 'Rebased-From:'

    meta = CommitMetaData()
    for line in msg.splitlines():
        if line.startswith(pull_tag):
            ref = line[len(pull_tag):].strip()
            # compensate for incorrect #bitcoin-core/gui#148
            if ref.startswith('#'):
                ref = ref[1:]
            meta.pull = FQId.parse(ref, DEFAULT_REPO)
        elif line.startswith(rebase_tag):
            meta.rebased_from = line[len(rebase_tag):].strip().split()

    # A commit without a pull reference is not considered a backport.
    return meta if meta.pull is not None else None
281  
# traverse merge commits
# Maps FQId -> PullData for every pull that is reported.
pulls = {}
# id: FQId of the pull; merge: hash of the commit associated with the pull
# (the merge commit, or one contained commit for pulls found through
# 'Github-Pull:' lines); commits: hashes belonging to the pull;
# index: position in commits_list, used for ordering the report.
PullData = namedtuple('PullData', ['id', 'merge', 'commits', 'index'])
# Commits not attributed to any pull; starts out as every commit in the range
# and is whittled down while merges are processed.
orphans = set(commits)
# Captures the text between 'Merge ' and the first ':' of a merge commit title.
MERGE_RE = re.compile('Merge (.*?):')
for c in commit_data.values():
    # is merge commit
    if len(c.parents)>1:
        # Only two-parent merges are handled; anything else aborts the script.
        assert(len(c.parents)==2)
        match = MERGE_RE.match(c.title)
        if match: # merges a pull request
            if c.sha in orphans:
                orphans.remove(c.sha)
            #print('removing ', c.sha)
            # Commits reachable from the second parent but not from the first,
            # i.e. the commits brought in by this merge.
            sub_commits = subprocess.check_output([GIT, 'rev-list', c.parents[0]+'..'+c.parents[1]])
            sub_commits = sub_commits.decode()
            sub_commits = set(sub_commits.rstrip().splitlines())
            # Raises ValueError if the captured text is not a PR specification.
            pull = FQId.parse(match.group(1), DEFAULT_REPO)

            # remove commits that are not in the global list
            sub_commits = sub_commits.intersection(commits)
            # Commits merged through a pull are no longer orphans, even when
            # the pull itself is excluded below.
            for cs in sub_commits:
                if cs in orphans:
                    orphans.remove(cs)

            if not pull in exclude_pulls:
                # if any sub-commits left, report them
                if sub_commits:
                    # only report pull if any new commit went into the release
                    index = commits_list.index(c.sha)
                    pulls[pull] = PullData(pull, c.sha, sub_commits, index)

                    # look up commits and see if they point to master pulls
                    # (=backport pull)
                    # add those too
                    # Maps referenced FQId -> list of commit hashes naming it
                    sub_pulls = defaultdict(list)
                    for cid in sub_commits:
                        md = parse_commit_message(commit_data[cid].message)
                        if md:
                            sub_pulls[md.pull].append(cid)

                    if not sub_pulls and 'backport' in c.title.lower():
                        # just information for manual checking
                        print(f'{pull}: Merge PR title {repr(c.title)} contains \'backport\' but there are no sub-pulls')

                    # Referenced pulls get the same index as the merge that
                    # contained them; they are not checked against exclude_pulls.
                    for (sub_pull, sub_pull_commits) in sub_pulls.items():
                        pulls[sub_pull] = PullData(sub_pull, sub_pull_commits[0], sub_pull_commits, index)
        else:
            print(f'{c.sha}: Merge commit does not merge a PR: {c.title}')
331  
# Orphans whose message carries a 'Github-Pull:' reference are backports that
# were committed directly: record them as pulls (with no commit list) and stop
# treating them as orphans. Iterate over a copy because the set is modified.
for sha in set(orphans):
    entry = commit_data[sha]
    meta = parse_commit_message(entry.message)
    if meta is None:
        continue
    position = commits_list.index(entry.sha)
    pulls[meta.pull] = PullData(meta.pull, entry.sha, [], position)
    orphans.remove(sha)
339  
# Pull ids ordered by the position of their commit in the commit list,
# which gives an approximately chronological order.
pulls_order = [p.id for p in sorted(pulls.values(), key=lambda p: p.index)]
# pulls_order = sorted(pulls.keys())
346  
def guess_category_from_labels(repo_info, labels):
    '''
    Pick the category of a PR based on its github labels.

    Labels are compared case-insensitively against the entries of
    repo_info['label_mapping'] in order; the first entry sharing a label with
    the PR wins. Without any match, repo_info['default_category'] is returned.
    '''
    present = {name.lower() for name in labels}
    for known_labels, category in repo_info['label_mapping']:
        if not present.isdisjoint(known_labels):
            return category
    return repo_info['default_category']
357  
def get_category(repo_info, labels, message):
    '''
    Guess category for a PR from repository, labels and message prefixes.
    Strip category from message.

    The category is first guessed from `labels`; a recognized title prefix
    from repo_info['prefixes'] (spelled '[prefix]:', '[prefix]' or 'prefix:',
    matched case-insensitively) then overrides it. The prefix is removed from
    the title, and re-added as 'prefix: ' when its do_strip flag is False.

    Returns a (category, message) tuple.
    '''
    category = guess_category_from_labels(repo_info, labels)
    message = message.strip()

    for (prefix, p_category, do_strip) in repo_info['prefixes']:
        for variant in ('[' + prefix + ']:', '[' + prefix + ']', prefix + ':'):
            if message.lower().startswith(variant):
                category = p_category
                message = message[len(variant):].lstrip()
                if not do_strip: # if strip is not requested, re-add prefix in sanitized way
                    # Upper-case only the first character: str.capitalize()
                    # would also lower-case the rest of the title and mangle
                    # identifiers and acronyms in it.
                    message = prefix + ': ' + message[:1].upper() + message[1:]

    return (category, message)
375  
# Look up the metadata of every pull and group the pulls by category.
pull_meta = {}  # FQId -> PR metadata dict, with 'title' replaced by the cleaned-up title
pull_labels = {}  # FQId -> list of label names
per_category = defaultdict(list)  # category -> list of (FQId, title, author login)
for pull in pulls_order:
    # Raises KeyError for a repository without an entry in REPO_INFO.
    repo_info = REPO_INFO[f'{pull.owner}/{pull.repo}']

    # Find github metadata for PR, if available
    # data0: issue metadata (used for the labels), None when not found
    # data1: PR metadata (used for title and author), placeholder when not found
    data0 = None
    data1 = {'title': '{Not found}', 'user': {'login':'unknown'}}
    if repo_info['ghmeta'] is not None:
        # Files are grouped in directories per hundred PR numbers, e.g. 123xx/12345.json
        filename = f'{repo_info["ghmeta"]}/issues/{pull.pr//100}xx/{pull.pr}.json'
        try:
            with open(filename, 'r') as f:
                data0 = json.load(f)
        except IOError as e:
            # Missing metadata is tolerated; the defaults above are kept.
            pass

        filename = f'{repo_info["ghmeta"]}/issues/{pull.pr//100}xx/{pull.pr}-PR.json'
        try:
            with open(filename, 'r') as f:
                data1 = json.load(f)
        except IOError as e:
            # Missing metadata is tolerated; the defaults above are kept.
            pass

    message = data1['title']
    author = data1['user']['login']
    if data0 is not None:
        labels = [l['name'] for l in data0['labels']]
    else:
        # Marker label shown for uncategorized pulls without issue metadata
        labels = ['Missing']

    # nightmarish UTF tweaking to fix broken output of export script
    # Encodes the title as ISO-8859-1 and decodes the bytes with the default
    # codec; anything that cannot be converted is replaced instead of raising.
    message = message.encode('ISO-8859-1', errors='replace').decode(errors='replace')

    # consistent ellipsis
    message = message.replace('...', '…')
    # no '.' at end
    if message.endswith('.'):
        message = message[0:-1]

    # determine category and new message from message
    category, message = get_category(repo_info, labels, message)
    # The cleaned-up title is what ends up in pulls.json
    data1['title'] = message

    per_category[category].append((pull, message, author))
    pull_labels[pull] = labels
    pull_meta[pull] = data1
423      
# One markdown section per category that has entries, in LABEL_MAPPING order.
for _labels, heading in LABEL_MAPPING:
    entries = per_category[heading]
    if entries:
        print(f'### {heading}')
        for entry_pull, entry_title, entry_author in entries:
            print(f'- {entry_pull} {entry_title} ({entry_author})')
        print()

# Uncategorized pulls additionally list their labels to help manual sorting.
leftovers = per_category[UNCATEGORIZED]
if leftovers:
    print(f'### {UNCATEGORIZED}')
    for entry_pull, entry_title, entry_author in leftovers:
        print(f'- {entry_pull} {entry_title} ({entry_author}) (labels: {pull_labels[entry_pull]})')
    print()

# Commits that could not be attributed to any pull.
print('### Orphan commits')
for sha in orphans:
    print(f'- `{sha[0:7]}` {commit_data[sha].title}')
442  
# write to json structure for postprocessing
# Commit records in the order of the commit list.
commits_d = [commit_data[sha] for sha in commits_list]

# One record per pull, sorted by pull id.
pulls_d = [
    {
        'id': str(record.id),
        'merge': record.merge,
        'commits': list(record.commits),
        'meta': pull_meta[record.id],
    }
    for record in (pulls[key] for key in sorted(pulls))
]

data_out = dict(commits=commits_d, pulls=pulls_d, orphans=list(orphans))

# The file is written to the current working directory.
with open('pulls.json', 'w') as f:
    json.dump(data_out, f, sort_keys=True, indent=4, separators=(',', ': '))
466