/ maint / extract-md-links
extract-md-links
  1  #!/usr/bin/env python3
  2  
  3  """
  4  Extract the reference link definitions, and uses, from a .md file.
  5  
  6  They are extracted *without normalisation* - in particular,
  7  without case folding.  This is contrary to markdown semantics,
  8  but it is desirable if we want to retain the original case.
  9  
 10  When run as a program, prints a json document
 11  
 12  {
 13     "used": ["anchor", ...],
   "defined": {"anchor": ["target", "title"], ...}
 15  }
 16  
 17  ("title" can be null instead)
 18  """
 19  
 20  # Basically all markdown parsers seem to treat undefined [foo]
 21  # link references as literal text, including the [ ].
 22  # I investigated several parsers including pandoc, marked (JS),
 23  # and python3-markdown, and none of them seemed to have a way to
 24  # override this or extract a list of apparently-unreferenced links.
 25  #
 26  # mistune has a hook mechanism, which we can abuse to insert
 27  # instrumentation that spots when link definitions are queried,
 28  # during processing.
 29  
import json
from typing import Optional, Tuple

import mistune  # type: ignore
 32  
 33  
 34  class Tracking:
 35      """
 36      Data structure which tracks used and defined keys.
 37  
 38      You may access the properties `used` and `defined`;
 39      `defined` mas each key to `(target, title)`.
 40      `used` is a map from keys to `True`,
 41  
 42      The keys here are *un*normalised, so they have not been lowercased.
 43      """
 44  
 45      defined: dict[str, Tuple[str, str]] = {}
 46      used: dict[str, bool] = {}
 47  
 48      def as_json(self):
 49          return json.dumps(
 50              {
 51                  "used": list(self.used.keys()),
 52                  "defined": self.defined,
 53              }
 54          )
 55  
 56  
 57  class TrackingBlockParser(mistune.BlockParser):
 58      def __init__(self, track):
 59          self.track = track
 60          super().__init__()
 61  
 62      def parse_def_link(self, m, state):
 63          k = m.group(1)
 64          t = m.group(2)
 65          title = m.group(3)
 66          self.track.defined[k] = (t, title)
 67          return super().parse_def_link(m, state)
 68  
 69  
 70  class TrackingInlineParser(mistune.InlineParser):
 71      def __init__(self, track, renderer):
 72          self.track = track
 73          super().__init__(renderer)
 74  
 75      def parse_ref_link(self, m, state):
 76          k = m.group(2) or m.group(1)
 77          self.track.used[k] = True
 78          return super().parse_ref_link(m, state)
 79  
 80  
 81  def extract_links(md_string):
 82      """
 83      Given a markdown file, as a string, returns a `TrackingDict`
 84      containing information about its ref links.
 85      """
 86  
 87      track = Tracking()
 88  
 89      # Our construction is reaching into the mistune innards more than ideal.
 90      # It works with Debian's python3-mistune 2.0.4-1.
 91      renderer = renderer = mistune.AstRenderer()
 92      md = mistune.Markdown(
 93          renderer,
 94          block=TrackingBlockParser(track),
 95          inline=TrackingInlineParser(track, renderer),
 96      )
 97      md(md_string)
 98      return track
 99  
100  
if __name__ == "__main__":
    # In theory we ought to be able to load this file as a Python module
    # instead of running it as a script.  But this does not work
    # because the Python module loading machinery insists that the filename
    # must end in .py.  But script names ought not to end in .py.
    #
    # The recipe here
    #    https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    # does not work with a filename not ending in .py:
    # "importlib.util.spec_from_file_location" returns None.

    import sys
    import json  # not used directly here; `Tracking.as_json` needs it as a module global
    import argparse

    parser = argparse.ArgumentParser(prog="extract-md-links")
    parser.add_argument("filename", nargs="?", default="-")
    args = parser.parse_args()

    # "-" (also the default when no argument is given) means read stdin.
    if args.filename == "-":
        text = sys.stdin.read()
    else:
        # Use a context manager so the file is closed once it has been read.
        with open(args.filename, "r") as in_file:
            text = in_file.read()

    print(extract_links(text).as_json())