extract-md-links
#!/usr/bin/env python3

"""
Extract the reference link definitions, and uses, from a .md file.

They are extracted *without normalisation* - in particular,
without case folding.  This is contrary to markdown semantics,
but it is desirable if we want to retain the original case.

When run as a program, prints a json document

    {
      "used": ["anchor", ...],
      "defined": {"anchor": ["target", "title"] }
    }

("title" can be null instead)
"""

# Basically all markdown parsers seem to treat undefined [foo]
# link references as literal text, including the [ ].
# I investigated several parsers including pandoc, marked (JS),
# and python3-markdown, and none of them seemed to have a way to
# override this or extract a list of apparently-unreferenced links.
#
# mistune has a hook mechanism, which we can abuse to insert
# instrumentation that spots when link definitions are queried,
# during processing.

import json
import mistune  # type: ignore
from typing import Optional, Tuple


class Tracking:
    """
    Data structure which tracks used and defined keys.

    You may access the properties `used` and `defined`;
    `defined` maps each key to `(target, title)`
    (where `title` may be `None`).
    `used` is a map from keys to `True`.

    The keys here are *un*normalised, so they have not been lowercased.
    """

    defined: dict[str, Tuple[str, Optional[str]]]
    used: dict[str, bool]

    def __init__(self) -> None:
        # These must be per-instance.  As class-level `{}` defaults they
        # would be shared by every Tracking object, so results from one
        # document would leak into the next.
        self.defined = {}
        self.used = {}

    def as_json(self) -> str:
        """
        Return the tracked data as a JSON string.

        The result is an object with `"used"` (a list of keys) and
        `"defined"` (an object mapping each key to `[target, title]`).
        """
        return json.dumps(
            {
                "used": list(self.used.keys()),
                "defined": self.defined,
            }
        )


class TrackingBlockParser(mistune.BlockParser):
    """
    Block parser which records every link definition in `track.defined`.
    """

    def __init__(self, track):
        self.track = track
        super().__init__()

    def parse_def_link(self, m, state):
        """
        Record the definition matched by `m`, then defer to mistune.

        Groups 1, 2 and 3 of the match are stored as the key, target and
        title respectively; the key is stored exactly as written.
        """
        k = m.group(1)
        t = m.group(2)
        title = m.group(3)
        # NOTE(review): a repeated definition of the same key overwrites
        # the earlier entry here; confirm that last-wins is what is wanted.
        self.track.defined[k] = (t, title)
        return super().parse_def_link(m, state)


class TrackingInlineParser(mistune.InlineParser):
    """
    Inline parser which records every reference link key in `track.used`.
    """

    def __init__(self, track, renderer):
        self.track = track
        super().__init__(renderer)

    def parse_ref_link(self, m, state):
        """
        Record the key used by the reference matched by `m`,
        then defer to mistune.

        The key is group 2 of the match if that is non-empty,
        and group 1 otherwise.
        """
        k = m.group(2) or m.group(1)
        self.track.used[k] = True
        return super().parse_ref_link(m, state)


def extract_links(md_string):
    """
    Given a markdown file, as a string, returns a `Tracking`
    containing information about its ref links.
    """

    track = Tracking()

    # Our construction is reaching into the mistune innards more than ideal.
    # It works with Debian's python3-mistune 2.0.4-1.
    renderer = mistune.AstRenderer()
    md = mistune.Markdown(
        renderer,
        block=TrackingBlockParser(track),
        inline=TrackingInlineParser(track, renderer),
    )
    # We only want the side effects on `track`; the rendered AST is discarded.
    md(md_string)
    return track


if __name__ == "__main__":
    # In theory we ought to be able to load this file as a Python module
    # instead of running it as a script.  But this does not work
    # because the Python module loading machinery insists that the filename
    # must end in .py.  But script names ought not to end in .py.
    #
    # The recipe here
    #   https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    # does not work with a filename not ending in .py:
    # "importlib.util.spec_from_file_location" returns None.

    import sys
    import argparse

    parser = argparse.ArgumentParser(prog="extract-md-links")
    parser.add_argument("filename", nargs="?", default="-")
    args = parser.parse_args()

    if args.filename == "-":
        text = sys.stdin.read()
    else:
        # Use a context manager so the file is closed once it has been read.
        with open(args.filename, "r") as in_file:
            text = in_file.read()

    print(extract_links(text).as_json())