/ docs / scripts / convert-notebooks.py
convert-notebooks.py
  1  """
  2  Converts all .ipynb files from the docs/ folder into .mdx files.
  3  
  4  This script uses nbconvert to do the processing.
  5  """
  6  
  7  import multiprocessing
  8  import re
  9  from pathlib import Path
 10  
 11  import nbformat
 12  import yaml
 13  from nbconvert.exporters import MarkdownExporter
 14  from nbconvert.preprocessors import Preprocessor
 15  
# Directory searched recursively for ``*.ipynb`` files. The path is relative,
# so it is resolved against the current working directory at run time.
SOURCE_DIR = Path("docs/")
# URL prefix to which a notebook's path is appended to build the
# ``custom_edit_url`` frontmatter entry of the generated page.
NOTEBOOK_BASE_EDIT_URL = "https://github.com/mlflow/mlflow/edit/master/docs/"
# URL prefix to which a notebook's path is appended to build the ``href`` of
# the "Download this notebook" button.
NOTEBOOK_BASE_DOWNLOAD_URL = "https://raw.githubusercontent.com/mlflow/mlflow/master/docs/"
 19  
 20  
 21  class EscapeBackticksPreprocessor(Preprocessor):
 22      def preprocess_cell(self, cell, resources, cell_index):
 23          if cell.cell_type == "code":
 24              # escape backticks, as code blocks will be rendered
 25              # inside a custom react component like:
 26              # <NotebookCellOutput>`{{ content }}`</NotebookCellOutput>
 27              # and having the backticks causes issues
 28              cell.source = cell.source.replace("`", r"\`")
 29  
 30              if "outputs" in cell:
 31                  for i, output in enumerate(cell["outputs"]):
 32                      if "text" in output:
 33                          output["text"] = output["text"].replace("`", r"\`")
 34                      elif "data" in output:
 35                          for key, value in output["data"].items():
 36                              if isinstance(value, str):
 37                                  output["data"][key] = value.replace("`", r"\`")
 38          elif cell.cell_type == "raw":
 39              cell.source = cell.source.replace("<br>", "<br />")
 40  
 41          return cell, resources
 42  
 43  
# Shared module-level exporter used by convert_path(). It runs
# EscapeBackticksPreprocessor on every cell and renders with the "mdx"
# template, which is looked up under ./scripts/nbconvert_templates.
# NOTE(review): the template directory is a relative path, so the script
# presumably has to be launched from the directory that contains scripts/ --
# confirm against how the docs build invokes it.
exporter = MarkdownExporter(
    preprocessors=[EscapeBackticksPreprocessor],
    template_name="mdx",
    extra_template_basedirs=["./scripts/nbconvert_templates"],
)
 49  
 50  
 51  def add_frontmatter(
 52      body: str,
 53      nb_path: Path,
 54  ) -> str:
 55      frontmatter = {
 56          "custom_edit_url": NOTEBOOK_BASE_EDIT_URL + str(nb_path),
 57          "slug": nb_path.stem,
 58      }
 59      formatted_frontmatter = yaml.dump(frontmatter)
 60  
 61      return f"""---
 62  {formatted_frontmatter}
 63  ---
 64  
 65  {body}"""
 66  
 67  
 68  def add_download_button(
 69      body: str,
 70      nb_path: Path,
 71  ) -> str:
 72      download_url = NOTEBOOK_BASE_DOWNLOAD_URL + str(nb_path)
 73      download_button = f'<NotebookDownloadButton href="{download_url}">Download this notebook</NotebookDownloadButton>'
 74  
 75      # Insert the notebook underneath the first H1 header (assumed to be the title)
 76      pattern = r"(^#\s+.+$)"
 77      return re.sub(pattern, rf"\1\n\n{download_button}", body, count=1, flags=re.MULTILINE)
 78  
 79  
 80  # add the imports for our custom cell output components
 81  def add_custom_component_imports(
 82      body: str,
 83  ) -> str:
 84      return f"""import {{ NotebookCodeCell }} from "@site/src/components/NotebookCodeCell"
 85  import {{ NotebookCellOutput }} from "@site/src/components/NotebookCellOutput"
 86  import {{ NotebookHTMLOutput }} from "@site/src/components/NotebookHTMLOutput"
 87  import {{ NotebookDownloadButton }} from "@site/src/components/NotebookDownloadButton"
 88  
 89  {body}
 90  """
 91  
 92  
 93  def convert_path(nb_path: Path):
 94      mdx_path = nb_path.with_stem(nb_path.stem + "-ipynb").with_suffix(".mdx")
 95      with open(nb_path) as f:
 96          nb = nbformat.read(f, as_version=4)
 97  
 98          body, _ = exporter.from_notebook_node(nb)
 99          body = add_custom_component_imports(body)
100          body = add_frontmatter(body, nb_path)
101          body = add_download_button(body, nb_path)
102  
103          with open(mdx_path, "w") as f:
104              f.write(body)
105  
106          return mdx_path
107  
108  
def main():
    """Convert every notebook found under ``SOURCE_DIR`` using a process pool."""
    # Collect the paths up front so the pool receives a concrete list.
    notebooks = [*SOURCE_DIR.rglob("*.ipynb")]

    with multiprocessing.Pool() as workers:
        workers.map(convert_path, notebooks)


if __name__ == "__main__":
    main()