/ maint / update-md-links
update-md-links
  1  #!/usr/bin/env python3
  2  #
  3  # Usage:
  4  #
  5  #   maint/update-md-links CHANGELOG.md
  6  #
  7  # Updates a markdown file, merging/inserting links from `gen_md_links`
  8  #
# New links are added next to the first place in the file
 10  # where links are defined.
 11  #
 12  # When we don't know what a link target should be, we emit four Xs.
 13  
 14  
 15  # ALGORITHM
 16  #
# Look for annotations in the file telling us how to work.
 18  #
 19  # Split the file up into sections (divided by the specified heading level).
 20  #
 21  # In each section, find existing link def lines
 22  # (ie, lines giving the target `t` for a link anchor text `a`).
 23  # This gives us
 24  #   - for each section
 25  #     - for each anchor, locations of relevant defs in this section
 26  #   - targets for some anchors
 27  #
 28  # Feed each section separately to gen_md_links
 29  # This gives us
 30  #   - for each section, needed anchor set
 31  #   - targets, for some anchors
 32  #
 33  # Reconcile definitions, to obtain precisely one target for each anchor.
 34  #
 35  # for each section.
 36  #   For each needed anchor
 37  #     If there are def(s) in this section, before the def collection, OK
#     Otherwise write a definition line to the new collection
 39  #   Collect defs from existing final link def collection (if any)
 40  #   Sort the collection
 41  #   Replace the relevant part of the file with the normalised collection
 42  #   (possibly *adding* the collection)
 43  
 44  from __future__ import annotations
 45  import argparse
 46  import collections
 47  import filecmp
 48  import os
 49  import re
 50  import subprocess
 51  import sys
 52  import tempfile
 53  
 54  from typing import Any, Optional, Tuple, Union
 55  
 56  # ---------- "constant" definitions ----------
 57  
 58  # regexps
 59  
# A link definition line, `[anchor]: target`, optionally ending in a newline.
# Captures the anchor text and the target (the target may be empty).
link_def_re = re.compile(r"\[([^][]+)\]\:\s?(.*)\n?")
# The start of a heading line: a run of `#`s followed by whitespace.
# Captures the `#`s, whose count is the heading level.
heading_re = re.compile(r"(\#+)\s")
# A line consisting of an instruction comment,
# `<!--@@ update-md-links ... -->`.  Captures the text after `update-md-links`.
instruction_re = re.compile(r"\<\!\-\-\@\@\s+update-md-links\s*(.*\S)\s*\-\-\>\s*$")
# The captured instruction text, as `KEY VALUE`.  Captures the key (lowercase
# letters, digits and hyphens) and the value (up to its last non-space character).
instruction_val_re = re.compile(r"\s*([-0-9a-z]+)\s+(.*\S)\s*")
 64  
 65  # ---------- set up globals: parse command line and read the input file ----------
 66  
 67  # "instructions" we understand in comments like this:
 68  # <!--@@ update-md-links INSTRUCTION VALUE -->
instructions = {
    # Heading level (number of `#`s) at which the file is split into sections.
    # With the default of 0 no heading matches, so the whole file is one section.
    "split-heading-level": 0,
    # Number of newlines appended after the link collection of each section
    # other than the one that ends the file.
    "section-blank-lines": 1,
}
 73  
 74  parser = argparse.ArgumentParser(
 75      prog="update-md-links",
 76      description="update links in a markdown document",
 77  )
 78  parser.add_argument("filename")
 79  parser.add_argument(
 80      "--check",
 81      action="store_true",
 82      help="Check that everything is up to date; make no changes",
 83  )
 84  
 85  args = parser.parse_args()
 86  md = list(open(args.filename, "r"))
 87  
 88  # ---------- types used in our data structures ----------
 89  
# Anchor: the anchor text of a link, ie the `a` in `[a]: t`.
# Target: what a link points to, ie the `t` in `[a]: t`.
# Source: human-readable description of where a definition came from.
Anchor = str
Target = str
Source = str

# One definition, of `t`, with human-readable source `source`, at md line `md_i`
#
# `md_i` is a 0-based index into `md`.
# The link definition might be from maint/gen_md_links, in which case `md_i` is `None`.
Def = collections.namedtuple("Def", ["t", "source", "md_i"])

# Defs: every definition seen for each anchor, possibly conflicting.
# Resolved: the single target chosen for each anchor.
Defs = dict[Anchor, list[Def]]
Resolved = dict[Anchor, Target]

# One section (if we have `split-heading-level` of other than 0).
#
# Comprises lines [`start`, `end`) (0-based indices into `md`).
# `defs_t2i` maps link target to a list of `md` indices it's defined on
# `a_needed` is a dict whose keys are the anchors `gen_md_links` reported
# for this section (each mapped to True); empty if there were none.
Section = collections.namedtuple("Section", ["start", "end", "defs_t2i", "a_needed"])
108  
109  # ---------- utility functions ----------
110  
# Count of nonfatal problems reported so far, via `trouble`.
troubles = 0


def trouble(m: str) -> None:
    """
    Report a nonfatal problem.

    Writes `m`, prefixed with "trouble: ", as one line on stderr, and bumps the
    global `troubles` counter so that the program can exit nonzero later.
    """
    global troubles
    sys.stderr.write("trouble: " + m + "\n")
    troubles = troubles + 1
123  
124  
# Anything with a `returncode` we know how to check.
CheckableSubprocess = Union[subprocess.CompletedProcess[Any], subprocess.Popen[str]]


def check_returncode(process: CheckableSubprocess) -> None:
    """
    Insist that `process` succeeded.

    Returns normally if `process.returncode` is zero.  Otherwise prints an
    error message to stderr and exits the program with status 12.
    """
    code = process.returncode
    if code == 0:
        return
    print("subprocess failed with nonzero returncode %s" % code, file=sys.stderr)
    sys.exit(12)
136  
137  
def is_link_def(line: str) -> Optional[Tuple[str, str]]:
    """
    Parse `line` as a link definition, if it is one.

    Returns `(a, t)`, the anchor and the target, when the whole of `line`
    has link definition syntax; returns None when it does not.
    """
    m = link_def_re.fullmatch(line)
    if m is not None:
        # Both groups always take part in a successful match.
        return m.group(1), m.group(2)
    return None
151  
152  
153  # ---------- search for instructions ----------
154  
155  
def process_instructions() -> None:
    """
    Looks for instructions and updates the global `instructions`.

    An instruction is a line of the form
    `<!--@@ update-md-links INSTRUCTION VALUE -->`, where VALUE is an integer.

    Instruction comments we cannot parse, instructions whose name we do not
    know, and values which are not integers are each reported with `trouble`
    and otherwise ignored, so that one bad line cannot abort the run with a
    traceback or plant an unexpected key in `instructions`.
    """

    for i, l in enumerate(md):
        g = instruction_re.fullmatch(l)
        if not g:
            continue

        source = "%s:%d" % (args.filename, i + 1)

        kv = instruction_val_re.fullmatch(g.group(1))
        if not kv:
            trouble("%s: unknown instruction" % source)
            continue

        k, v = kv.group(1), kv.group(2)
        if k not in instructions:
            trouble("%s: unknown value instruction %s" % (source, k))
            continue

        try:
            instructions[k] = int(v)
        except ValueError:
            # The value regexp accepts any text, so check it is numeric here.
            trouble(
                "%s: bad value %s for instruction %s (expected an integer)"
                % (source, repr(v), k)
            )
178  
179  
180  # ---------- break input into sections ----------
181  
182  
def split_input() -> list[Section]:
    """
    Divide `md` into sections.

    A new section starts at every heading whose level equals the
    `split-heading-level` instruction; a heading on the very first line does
    not produce an empty section before it.  Always returns at least one
    section, and together the sections cover the whole of `md`.
    """
    level = int(instructions["split-heading-level"])

    # Indices at which sections begin; we append the end of file afterwards.
    boundaries = [0]
    for lineno, text in enumerate(md):
        m = heading_re.match(text)
        if m is None or len(m.group(1)) != level:
            continue
        if lineno != boundaries[-1]:
            boundaries.append(lineno)
    boundaries.append(len(md))

    # Each Section needs its own fresh dicts, so build them one at a time
    # here rather than relying on namedtuple defaults (which would be shared).
    return [Section(lo, hi, {}, {}) for lo, hi in zip(boundaries, boundaries[1:])]
207  
208  
209  # ---------- scan input sections' contents ----------
210  
211  
def scan_sections(sections: list[Section]) -> Defs:
    """
    Scans each section in `sections`

    For every section this:
      - records each existing link def line, both in the returned `Defs`
        and in the section's `defs_t2i`;
      - feeds the section's text to `maint/gen_md_links`, records each link def
        that prints (with an `md_i` of `None`), and marks the anchor in the
        section's `a_needed`.

    Returns all the definitions found, keyed by anchor.

    Exits the program with status 12 if `gen_md_links` prints a line which is
    not a link def, or finishes with a nonzero returncode.
    """
    link_defs: Defs = {}

    def record_link_def(
        a: Anchor, t: Target, source: Source, md_i: Optional[int]
    ) -> None:
        """
        Record that anchor `a` is defined to have target url `t`.

        `source` and `md_i` are as for `Def`.

        `t` may be the empty string (and for output from `gen_md_links`, often is).
        """
        link_defs.setdefault(a, []).append(Def(t, source, md_i))

    for s in sections:
        # ---------- for each section, find existing link def lines ----------

        for i in range(s.start, s.end):
            at = is_link_def(md[i])
            if at:
                a, t = at
                record_link_def(a, t, "%s:%d" % (args.filename, i + 1), i)
                s.defs_t2i.setdefault(t, []).append(i)

        # ---------- for each section, run gen_md_links ----------

        # Both the temporary file and the subprocess (with its pipe) are
        # released as soon as we are done with this section.
        with tempfile.TemporaryFile(mode="w+", buffering=True) as text_file:
            # The entries of `md` keep their own line endings, so write them
            # verbatim: adding a newline per line would put a blank line
            # after every line of the section.
            text_file.writelines(md[s.start : s.end])
            text_file.flush()
            text_file.seek(0, 0)

            with subprocess.Popen(
                ["maint/gen_md_links", "--", "-"],
                stdin=text_file,
                stdout=subprocess.PIPE,
                encoding="utf-8",
            ) as gen_links_output:
                assert gen_links_output.stdout
                for line in gen_links_output.stdout:
                    line = line.strip()
                    if line == "":
                        continue
                    at = is_link_def(line)
                    if at is None:
                        print(
                            "gen_md_links produced bad output line %s (for %s:%d..%d)"
                            % (repr(line), args.filename, s.start + 1, s.end),
                            file=sys.stderr,
                        )
                        sys.exit(12)
                    a, t = at
                    record_link_def(a, t, "gen_md_links", None)
                    s.a_needed[a] = True

            # Leaving the `with` waited for the subprocess, so `returncode` is set.
            check_returncode(gen_links_output)

    return link_defs
276  
277  
278  # ---------- reconcile definitions ----------
279  
280  
def resolve_definitions(link_defs: Defs) -> Resolved:
    """
    Choose one target for every anchor in `link_defs`.

    Definitions with a blank target are disregarded.  If an anchor is left
    with more than one distinct target, that is reported as a trouble (with
    the candidates and where each is defined listed on stderr), and the first
    target seen is used.  If it is left with none, the target is four Xs.
    """
    resolved = {}

    for anchor, defs in link_defs.items():
        # Distinct nonblank targets, in the order first seen.
        targets = list(dict.fromkeys(d.t for d in defs if d.t.strip() != ""))

        if len(targets) > 1:
            trouble("conflicting definitions for [%s]" % anchor)
            # The report lists every distinct target, blank ones included.
            for candidate in dict.fromkeys(d.t for d in defs):
                print("  candidate %s" % candidate, file=sys.stderr)
                for other in defs:
                    if other.t == candidate:
                        print("    defined %s" % other.source, file=sys.stderr)

        # The placeholder is spelled in two halves so that this file does
        # not itself contain the four-X marker.
        resolved[anchor] = targets[0] if targets else "XX" + "XX"

    return resolved
313  
314  
315  # ---------- collate outputs ----------
316  
317  
def collate_insert_outputs(
    sections: list[Section], link_defs: Defs, link_def: Resolved
) -> None:
    """
    Rewrite the link collection at the end of each section

    The link collection is the trailing run of blank lines and link def lines
    of a section.  It is replaced by a sorted collection defining each needed
    anchor that has no definition earlier in the same section, followed by
    `section-blank-lines` newlines (except at the end of the file).

    Sections consisting only of blank lines and link defs, and sections
    needing no anchors, are left alone.

    Updates `md` in place.
    """
    for sec in sections:
        # Walk back from the end of the section over blank lines and link
        # defs, to find where the link collection (with the blank lines on
        # either side of it) starts.
        tail = sec.end
        while tail > sec.start:
            candidate = md[tail - 1]
            if candidate.strip() == "" or is_link_def(candidate):
                tail -= 1
            else:
                break

        # Nothing but links/blanks in this section, or no anchors needed.
        if tail <= sec.start or not sec.a_needed:
            continue

        # Needed anchors with no definition in the section body, ie in
        # [sec.start, tail), get a line in the new collection.
        missing = sorted(
            "[%s]: %s\n" % (a, link_def[a])
            for a in sec.a_needed
            if not any(
                d.md_i is not None and sec.start <= d.md_i < tail
                for d in link_defs[a]
            )
        )

        # Blank out the old collection, keeping the number of entries in `md`
        # the same so that other sections' indices stay valid.
        md[tail : sec.end] = [""] * (sec.end - tail)

        addition = ""
        if missing:
            addition = "\n" + "".join(missing)
        if sec.end != len(md):
            addition += "\n" * int(instructions["section-blank-lines"])

        # The replacement text rides on the last line of the section body.
        md[tail - 1] += addition
375  
376  
377  # ---------- write output ----------
378  
379  
def write_output() -> None:
    """
    Writes the output file

    Writes `md` to a `.tmp` file next to the input, and then:
      - if there were any troubles, leaves the `.tmp` uninstalled and exits
        with status 12;
      - with `--check`, runs `diff -u` of the input against the `.tmp`: exits
        with status 1 if they differ, and otherwise removes the `.tmp`;
      - otherwise reports whether anything changed, and renames the `.tmp`
        over the input file.
    """
    new_filename = "%s.tmp" % args.filename
    # The context manager closes (and so flushes) the file even if a write fails.
    with open(new_filename, "w", buffering=True) as output:
        output.writelines(md)

    if troubles != 0:
        print("trouble, not installing %s" % new_filename, file=sys.stderr)
        sys.exit(12)

    if args.check:
        r = subprocess.run(["diff", "-u", "--", args.filename, new_filename])
        # diff exits 1 for "files differ"; anything else nonzero is a failure.
        if r.returncode == 1:
            print("%s links not up to date." % args.filename, file=sys.stderr)
            sys.exit(1)
        check_returncode(r)
        os.remove(new_filename)
    else:
        if filecmp.cmp(args.filename, new_filename):
            print("%s unchanged" % args.filename)
        else:
            print("%s *updated*!" % args.filename)
        os.rename(new_filename, args.filename)
409  
410  
411  # ---------- main program ----------
412  
# Pick up any `<!--@@ update-md-links ... -->` settings from the file.
process_instructions()
# Divide the file into sections at the configured heading level.
sections = split_input()
# Gather link defs already in the file, and those gen_md_links reports.
link_defs = scan_sections(sections)
# Settle on one target for each anchor.
link_def = resolve_definitions(link_defs)
# Rewrite each section's link collection (modifies `md`).
collate_insert_outputs(sections, link_defs, link_def)
# Write the result to a `.tmp`, then diff it or install it.
write_output()