update-md-links
#!/usr/bin/env python3
#
# Usage:
#
#   maint/update-md-links CHANGELOG.md
#
# Updates a markdown file, merging/inserting links from `gen_md_links`
#
# New links are added to the collection of link definitions
# at the end of each section.
#
# When we don't know what a link target should be, we emit four Xs.


# ALGORITHM
#
# Look for annotations in the file telling us how to work.
#
# Split the file up into sections (divided by the specified heading level).
#
# In each section, find existing link def lines
# (ie, lines giving the target `t` for a link anchor text `a`).
# This gives us
#   - for each section
#       - for each anchor, locations of relevant defs in this section
#   - targets for some anchors
#
# Feed each section separately to gen_md_links
# This gives us
#   - for each section, needed anchor set
#   - targets, for some anchors
#
# Reconcile definitions, to obtain precisely one target for each anchor.
#
# For each section:
#     For each needed anchor
#         If there are def(s) in this section, before the def collection, OK
#         Otherwise write a definition line to the new collection
#     Collect defs from existing final link def collection (if any)
#     Sort the collection
#     Replace the relevant part of the file with the normalised collection
#     (possibly *adding* the collection)

from __future__ import annotations
import argparse
import collections
import filecmp
import os
import re
import subprocess
import sys
import tempfile

from typing import Any, Optional, Tuple, Union

# ---------- "constant" definitions ----------

# regexps

# A link definition line, `[ANCHOR]: TARGET`.
# Group 1 is the anchor (which may not contain square brackets);
# group 2 is the rest of the line (the target, possibly empty).
link_def_re = re.compile(r"\[([^][]+)\]\:\s?(.*)\n?")
# The start of a heading line; group 1 is the run of `#` characters.
heading_re = re.compile(r"(\#+)\s")
# A whole-line instruction comment; group 1 is everything after `update-md-links`.
instruction_re = re.compile(r"\<\!\-\-\@\@\s+update-md-links\s*(.*\S)\s*\-\-\>\s*$")
# The body of an instruction: group 1 is the instruction name, group 2 its value.
instruction_val_re = re.compile(r"\s*([-0-9a-z]+)\s+(.*\S)\s*")

# ---------- set up globals: parse command line and read the input file ----------

# "instructions" we understand in comments like this:
#    <!--@@ update-md-links INSTRUCTION VALUE -->
#
# The values here are the defaults, used when the file has no such comment.
instructions: dict[str, int] = {
    "split-heading-level": 0,
    "section-blank-lines": 1,
}

parser = argparse.ArgumentParser(
    prog="update-md-links",
    description="update links in a markdown document",
)
parser.add_argument("filename")
parser.add_argument(
    "--check",
    action="store_true",
    help="Check that everything is up to date; make no changes",
)

args = parser.parse_args()

# The whole input file, one entry per line, each keeping its line ending.
# Read inside `with` so that the file handle is closed promptly
# rather than whenever the garbage collector gets to it.
with open(args.filename, "r") as md_file:
    md: list[str] = list(md_file)

# ---------- types used in our data structures ----------

Anchor = str
Target = str
Source = str

# One definition, of `t`, with human-readable source `source`, at md line `md_i`
#
# The link definition might be from maint/gen_md_links, in which case `md_i` is `None`.
Def = collections.namedtuple("Def", ["t", "source", "md_i"])

# `Defs` holds every definition seen for each anchor
# (both those found in the file and those reported by gen_md_links).
# `Resolved` holds the single target finally chosen for each anchor.
Defs = dict[Anchor, list[Def]]
Resolved = dict[Anchor, Target]

# One section (if we have `split-heading-level` of other than 0).
#
# Comprises lines [`start`, `end`).
# `defs_t2i` maps link target to a list of line numbers it's defined on
# `a_needed` maps each anchor that gen_md_links reported for this section to True
Section = collections.namedtuple("Section", ["start", "end", "defs_t2i", "a_needed"])

# ---------- utility functions ----------

troubles = 0


def trouble(m: str) -> None:
    """
    Record a "trouble" - a nonfatal problem.

    Prints the message `m` to stderr, and increments `troubles`, so we exit nonzero, later.
    """
    global troubles
    print("trouble: " + m, file=sys.stderr)
    troubles += 1


CheckableSubprocess = Union[subprocess.CompletedProcess[Any], subprocess.Popen[str]]


def check_returncode(process: CheckableSubprocess) -> None:
    """
    Check `process.returncode`; if it's not zero, print an error message and exit nonzero.

    The process must already have finished (so that `returncode` has been set).
    On failure, exits the program with status 12.
    """
    r = process.returncode
    if r != 0:
        print("subprocess failed with nonzero returncode %s" % r, file=sys.stderr)
        sys.exit(12)


def is_link_def(line: str) -> Optional[Tuple[str, str]]:
    """
    Is `line` in the syntax of a link definition?

    If so returns `(a, t)` where `a` is the anchor and `t` the target.
    Otherwise returns None.

    The whole of `line` must match; one trailing newline is tolerated
    and is not included in `t`.
    """
    g = link_def_re.fullmatch(line)
    if g is None:
        return None
    return (g.group(1), g.group(2))


# ---------- search for instructions ----------


def process_instructions() -> None:
    """
    Looks for instructions and updates the global `instructions`.

    Every line of `md` that is an instruction comment is examined.
    Malformed instructions, unknown instruction names, and values which
    aren't integers are each reported with `trouble` and otherwise ignored.
    """

    for i, l in enumerate(md):
        g = instruction_re.fullmatch(l)
        if not g:
            continue

        source = "%s:%d" % (args.filename, i + 1)

        g = instruction_val_re.fullmatch(g.group(1))
        if not g:
            trouble("%s: unknown instruction" % source)
            continue

        k, v = g.group(1), g.group(2)
        if k not in instructions:
            trouble("%s: unknown value instruction %s" % (source, k))
            continue

        # Report a bad value as a trouble, with its location,
        # rather than dying with a ValueError traceback.
        try:
            instructions[k] = int(v)
        except ValueError:
            trouble(
                "%s: value for instruction %s is not an integer: %s"
                % (source, k, repr(v))
            )


# ---------- break input into sections ----------


def split_input() -> list[Section]:
    """
    Parse `md` into sections.

    A new section starts at each heading whose level (number of `#`)
    is exactly `split-heading-level`.  Anything before the first such heading
    forms a section of its own.  There is always at least one section.
    """

    sections: list[Section] = []

    section_start = 0

    # namedtuple has defaults= but it gives every fresh tuple an aliased copy of the
    # same value!  So we provide this constructor.
    def new_section(start: int, end: int) -> None:
        sections.append(Section(start, end, {}, {}))

    split_level = int(instructions["split-heading-level"])

    for i, l in enumerate(md):
        g = heading_re.match(l)
        if g and len(g.group(1)) == split_level:
            # Don't emit an empty section if the file starts with a heading.
            if i != section_start:
                new_section(section_start, i)
            section_start = i

    new_section(section_start, len(md))

    return sections


# ---------- scan input sections' contents ----------


def scan_sections(sections: list[Section]) -> Defs:
    """
    Scans each section in `sections`

    For each section, records the link definitions already present in `md`,
    and then runs `maint/gen_md_links` on the section's text to find out
    which anchors the section needs (and, for some of them, their targets).

    Updates each section's `defs_t2i` and `a_needed` in place.
    Returns all the definitions found, from both sources, keyed by anchor.

    Exits the program with status 12 if `gen_md_links` fails
    or produces a line which isn't a link definition.
    """
    link_defs: Defs = {}

    def record_link_def(
        a: Anchor, t: Target, source: Source, md_i: Optional[int]
    ) -> None:
        """
        Record that anchor `a` is defined to have target url `t`.

        `source` and `md_i` are as for `Def`.

        `t` may be the empty string (and for output from `gen_md_links`, often is).
        """
        link_defs.setdefault(a, []).append(Def(t, source, md_i))

    for s in sections:
        # ---------- for each section, find existing link def lines ----------

        for i in range(s.start, s.end):
            at = is_link_def(md[i])
            if at:
                a, t = at
                record_link_def(a, t, "%s:%d" % (args.filename, i + 1), i)
                s.defs_t2i.setdefault(t, []).append(i)

        # ---------- for each section, run gen_md_links ----------

        with tempfile.TemporaryFile(mode="w+") as text_file:
            # The entries in `md` already end with their newline, so write them
            # verbatim.  (Using print() here would add a second newline to every
            # line, so gen_md_links would see a double-spaced document.)
            text_file.writelines(md[s.start : s.end])
            text_file.flush()
            text_file.seek(0, 0)

            with subprocess.Popen(
                ["maint/gen_md_links", "--", "-"],
                stdin=text_file,
                stdout=subprocess.PIPE,
                encoding="utf-8",
            ) as gen_links_output:
                assert gen_links_output.stdout
                for line in gen_links_output.stdout:
                    line = line.strip()
                    if line == "":
                        continue
                    at = is_link_def(line)
                    if at is None:
                        print(
                            "gen_md_links produced bad output line %s (for %s:%d..%d)"
                            % (repr(line), args.filename, s.start + 1, s.end),
                            file=sys.stderr,
                        )
                        sys.exit(12)
                    a, t = at
                    record_link_def(a, t, "gen_md_links", None)
                    s.a_needed[a] = True

                gen_links_output.wait()

        check_returncode(gen_links_output)

    return link_defs


# ---------- reconcile definitions ----------


def resolve_definitions(link_defs: Defs) -> Resolved:
    """
    Resolve link definitions.

    For each anchor, picks the one nonempty target that its definitions give.
    If there are several different nonempty targets, reports a trouble
    (listing every candidate and where it was defined) and uses the first.
    If there is none, uses four Xs as a placeholder.
    """
    link_def: Resolved = {}

    for a, defs in link_defs.items():
        # Nonempty targets, in order of first appearance, and where each was defined.
        candidates: dict[Target, list[Source]] = {}
        for d in defs:
            if d.t.strip() != "":
                candidates.setdefault(d.t, []).append(d.source)

        ts = list(candidates.keys())
        if len(ts) > 1:
            trouble("conflicting definitions for [%s]" % a)
            # Show every distinct target (empty ones included) once,
            # followed by all the places which define it.
            done: set[Target] = set()
            for d in defs:
                if d.t in done:
                    continue
                done.add(d.t)
                print(" candidate %s" % d.t, file=sys.stderr)
                for other in defs:
                    if other.t == d.t:
                        print(" defined %s" % other.source, file=sys.stderr)
        if len(ts) == 0:
            # Spelled like this so that this script doesn't itself contain four Xs.
            ts.append("XX" + "XX")

        link_def[a] = ts[0]

    return link_def


# ---------- collate outputs ----------


def collate_insert_outputs(
    sections: list[Section], link_defs: Defs, link_def: Resolved
) -> None:
    """
    Collate link definitions into each section

    For each section which needs any anchors, replaces the run of blank lines
    and link definitions at the end of the section with a sorted collection of
    definitions for those needed anchors which aren't already defined
    earlier in the section.

    Updates `md` in place.
    """
    for s in sections:
        # Walk backwards over trailing blank lines and link definitions.
        linkcoll_start = s.end
        while linkcoll_start > s.start:
            prev_l = md[linkcoll_start - 1]
            if prev_l.strip() != "" and not is_link_def(prev_l):
                break
            linkcoll_start -= 1

        if linkcoll_start <= s.start:
            continue  # section contains only links, ignore it

        if not s.a_needed:
            continue  # section contains no link anchors, ignore it

        # Now linkcoll_start is the start of the link collection for this section.
        # (Including blank lines either side of the link collection.)

        # A needed anchor gets a line in the new collection unless the section
        # already defines it somewhere before the collection.
        new_collection = sorted(
            "[%s]: %s\n" % (a, link_def[a])
            for a in s.a_needed
            if not any(
                d.md_i is not None and s.start <= d.md_i < linkcoll_start
                for d in link_defs[a]
            )
        )

        # delete old collection
        for i in range(linkcoll_start, s.end):
            md[i] = ""

        o = ""
        if new_collection:
            o += "\n" + "".join(new_collection)

        if s.end != len(md):
            o += "\n" * int(instructions["section-blank-lines"])

        # The new text is appended to the last line of the section's real content.
        # If that line has no newline (it's the unterminated last line of the file),
        # supply one, so that the collection doesn't get glued onto it.
        if o != "" and not md[linkcoll_start - 1].endswith("\n"):
            md[linkcoll_start - 1] += "\n"

        md[linkcoll_start - 1] += o


# ---------- write output ----------


def write_output() -> None:
    """
    Writes the output file

    Writes to a `.tmp`, and then runs diff, or installs it, as appropriate.

    If there were any troubles, the `.tmp` is left in place, nothing is installed,
    and we exit with status 12.
    With `--check`, we exit with status 1 if the file isn't up to date.
    """
    new_filename = "%s.tmp" % args.filename
    with open(new_filename, "w") as output:
        output.writelines(md)

    if troubles != 0:
        print("trouble, not installing %s" % new_filename, file=sys.stderr)
        sys.exit(12)

    if args.check:
        r = subprocess.run(["diff", "-u", "--", args.filename, new_filename])
        # diff exits 1 when the files differ; anything else nonzero is a failure.
        if r.returncode == 1:
            print("%s links not up to date." % args.filename, file=sys.stderr)
            sys.exit(1)
        check_returncode(r)
        os.remove(new_filename)
    else:
        if filecmp.cmp(args.filename, new_filename):
            print("%s unchanged" % args.filename)
        else:
            print("%s *updated*!" % args.filename)
        # Rename even if unchanged: that also gets rid of the `.tmp`.
        os.rename(new_filename, args.filename)


# ---------- main program ----------

process_instructions()
sections = split_input()
link_defs = scan_sections(sections)
link_def = resolve_definitions(link_defs)
collate_insert_outputs(sections, link_defs, link_def)
write_output()