Cradicle Explorer

export-to-pdf.py
  1  #!/usr/bin/env python3
  2  """
  3  Business Plan Markdown to PDF Converter
  4  
  5  Uses the existing export-to-html.py logic but with print-optimized CSS,
  6  then converts to PDF via WeasyPrint.
  7  
  8  Usage:
  9      python3 export-to-pdf.py auditandfix-business-plan.md auditandfix-business-plan.pdf
 10  """
 11  
 12  import re
 13  import sys
 14  
 15  
 16  def slugify(text):
 17      text = re.sub(r'<[^>]+>', '', text)
 18      text = text.lower()
 19      text = re.sub(r'[^\w\s-]', '', text)
 20      text = re.sub(r'[\s_]+', '-', text)
 21      text = re.sub(r'-+', '-', text)
 22      return text.strip('-')
 23  
 24  
 25  def extract_headings(md_content):
 26      # Headings to skip in the TOC
 27      skip = {'Contents', 'Table of Contents'}
 28      headings = []
 29      for match in re.finditer(r'^(#{1,4})\s+(.+)$', md_content, re.MULTILINE):
 30          level = len(match.group(1))
 31          raw = match.group(2)
 32          text = raw.strip('*').strip()
 33          # Skip date line, inline TOC heading
 34          if text.startswith('Date:') or text in skip:
 35              continue
 36          slug = slugify(text)
 37          headings.append({'level': level, 'text': text, 'slug': slug})
 38      return headings
 39  
 40  
 41  def markdown_to_html(md_content):
 42      html = md_content
 43  
 44      # Remove HTML comments (TODOs etc)
 45      html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
 46  
 47      # Remove the date heading line (rendered on cover page instead)
 48      html = re.sub(r'^## \*\*Date:\*\*.*$', '', html, flags=re.MULTILINE)
 49  
 50      # Remove all inline TOC sections (## Contents / ## Table of Contents + list of links)
 51      # Matches heading followed by numbered or bullet links until first "---"
 52      html = re.sub(
 53          r'^## (?:Contents|Table of Contents)\n(?:\n|\d+\.\s+\[.*\]\(.*\)\n?|- \[.*\]\(.*\)\n?)*\n*---',
 54          '<hr>',
 55          html,
 56          flags=re.MULTILINE
 57      )
 58  
 59      # Convert headers with slugified IDs
 60      for n in [4, 3, 2, 1]:
 61          pattern = r'^' + ('#' * n) + r'\s+(.+)$'
 62          def make_replacer(level):
 63              def replacer(match):
 64                  text = match.group(1).strip()
 65                  slug = slugify(text)
 66                  return f'<h{level} id="{slug}">{text}</h{level}>'
 67              return replacer
 68          html = re.sub(pattern, make_replacer(n), html, flags=re.MULTILINE)
 69  
 70      # Convert tables
 71      table_pattern = r'\|(.+)\|\n\|[-:\| ]+\|\n((?:\|.+\|\n?)+)'
 72      def convert_table(match):
 73          header = match.group(1)
 74          rows = match.group(2)
 75          header_cells = [f'<th>{cell.strip()}</th>' for cell in header.split('|') if cell.strip()]
 76          header_row = f'<tr>{"".join(header_cells)}</tr>'
 77          body_rows = []
 78          for row in rows.strip().split('\n'):
 79              cells = [f'<td>{cell.strip()}</td>' for cell in row.split('|') if cell.strip()]
 80              if cells:
 81                  body_rows.append(f'<tr>{"".join(cells)}</tr>')
 82          return f'<table>\n<thead>{header_row}</thead>\n<tbody>{"".join(body_rows)}</tbody>\n</table>'
 83      html = re.sub(table_pattern, convert_table, html, flags=re.MULTILINE)
 84  
 85      # Strikethrough ~~text~~
 86      html = re.sub(r'~~(.+?)~~', r'<del>\1</del>', html)
 87  
 88      # Bold and italic (double-star first, then single-star, then underscores)
 89      html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
 90      html = re.sub(r'\*(.+?)\*', r'<em>\1</em>', html)
 91      html = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'<em>\1</em>', html)
 92  
 93      # Links
 94      html = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'<a href="\2">\1</a>', html)
 95  
 96      # Horizontal rules
 97      html = re.sub(r'^---+$', '<hr>', html, flags=re.MULTILINE)
 98  
 99      # Numbered lists
100      html = re.sub(r'^\d+\.\s+(.+)$', r'<li>\1</li>', html, flags=re.MULTILINE)
101  
102      # Unordered lists (with indentation support)
103      html = re.sub(r'^\s*[\-\*]\s+(.+)$', r'<li>\1</li>', html, flags=re.MULTILINE)
104  
105      # Wrap consecutive <li> in <ul>
106      html = re.sub(r'((?:<li>.*?</li>\n?)+)', r'<ul>\n\1</ul>\n', html)
107  
108      # Convert paragraphs
109      lines = html.split('\n')
110      result = []
111      in_paragraph = False
112      for line in lines:
113          stripped = line.strip()
114          if not stripped:
115              if in_paragraph:
116                  result.append('</p>')
117                  in_paragraph = False
118              result.append('')
119          elif stripped.startswith('<') or '|' in stripped:
120              if in_paragraph:
121                  result.append('</p>')
122                  in_paragraph = False
123              result.append(line)
124          else:
125              if not in_paragraph:
126                  result.append('<p>')
127                  in_paragraph = True
128              result.append(line)
129      if in_paragraph:
130          result.append('</p>')
131  
132      return '\n'.join(result)
133  
134  
def build_toc_page(headings):
    """Render the table-of-contents page as an HTML fragment.

    Each heading of level 1-3 becomes a ``<li class="toc-l<level>">`` that
    links to the heading's slug; deeper headings are left out. Page numbers
    are not written here: the stylesheet adds them through ``target-counter``
    on the link.
    """
    entries = [
        f'<li class="toc-l{entry["level"]}">'
        f'<a href="#{entry["slug"]}">{entry["text"]}</a>'
        f'</li>'
        for entry in headings
        if entry['level'] <= 3
    ]
    return '\n'.join([
        '<div class="toc-page">',
        '<h1 class="toc-title">Contents</h1>',
        '<ul class="toc">',
        *entries,
        '</ul>',
        '</div>',
    ])
151  
152  
def create_pdf_html(body_html, toc_html, title="Audit&Fix Business Plan"):
    """Assemble the complete print-ready HTML document.

    The result consists of a fixed cover page, the supplied TOC fragment and
    the supplied body wrapped in ``<div class="content">``, all styled by an
    embedded A4 print stylesheet. ``title`` is only used for the ``<title>``
    element; the cover text is hard-coded. None of the arguments are escaped.
    """
    # Kept as a plain (non-f) string so the CSS braces need no doubling.
    stylesheet = '''    @page {
        size: A4;
        margin: 20mm 18mm 25mm 18mm;
        @bottom-center {
            content: counter(page);
            font-size: 9pt;
            color: #888;
        }
    }
    @page :first {
        @bottom-center { content: none; }
    }
    body {
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
        font-size: 10pt;
        line-height: 1.5;
        color: #222;
        margin: 0;
        padding: 0;
    }

    /* Cover page */
    .cover {
        page-break-after: always;
        text-align: center;
        padding-top: 120px;
    }
    .cover h1 {
        font-size: 32pt;
        color: #1a5276;
        border: none;
        margin-bottom: 20px;
    }
    .cover .subtitle {
        font-size: 14pt;
        color: #555;
        margin-bottom: 40px;
    }
    .cover .meta {
        font-size: 11pt;
        color: #777;
        margin-top: 60px;
    }
    .cover .meta p {
        margin: 6px 0;
    }

    /* TOC */
    .toc-page {
        page-break-after: always;
    }
    .toc-page h1, h1.toc-title {
        font-size: 18pt;
        color: #1a5276;
        border-bottom: 2px solid #1a5276;
        padding-bottom: 8px;
        page-break-before: avoid;
    }
    ul.toc {
        list-style: none;
        padding: 0;
    }
    ul.toc li {
        padding: 4px 0;
        border-bottom: 1px dotted #ccc;
    }
    ul.toc li a {
        color: #333;
        text-decoration: none;
        display: block;
    }
    ul.toc li a::after {
        content: target-counter(attr(href url), page);
        float: right;
        color: #888;
    }
    .toc-l1 { font-size: 12pt; font-weight: 700; padding-left: 0; margin-top: 10px; }
    .toc-l2 { font-size: 11pt; font-weight: 600; padding-left: 15px; }
    .toc-l3 { font-size: 10pt; font-weight: 400; padding-left: 30px; }

    /* Headings */
    h1 {
        font-size: 20pt;
        color: #1a5276;
        border-bottom: 2px solid #1a5276;
        padding-bottom: 6px;
        margin-top: 30px;
        page-break-after: avoid;
    }
    h2 {
        font-size: 16pt;
        color: #2c3e50;
        border-bottom: 1px solid #bdc3c7;
        padding-bottom: 4px;
        margin-top: 24px;
        page-break-before: always;
        page-break-after: avoid;
    }
    /* Don't break before the very first h2 (date line) or the financial spreadsheets sub-h2s */
    h2:first-of-type { page-break-before: auto; }
    h3 {
        font-size: 13pt;
        color: #34495e;
        margin-top: 18px;
        page-break-after: avoid;
    }
    h4 {
        font-size: 11pt;
        color: #555;
        margin-top: 14px;
        page-break-after: avoid;
    }

    /* Tables */
    table {
        width: 100%;
        border-collapse: collapse;
        margin: 12px 0;
        font-size: 8.5pt;
        page-break-inside: auto;
    }
    thead { display: table-header-group; }
    tr { page-break-inside: avoid; }
    th, td {
        border: 1px solid #ccc;
        padding: 5px 7px;
        text-align: left;
        vertical-align: top;
    }
    th {
        background-color: #1a5276;
        color: white;
        font-weight: 600;
    }
    tr:nth-child(even) {
        background-color: #f7f9fc;
    }

    /* Lists */
    ul {
        margin: 8px 0;
        padding-left: 22px;
    }
    li {
        margin: 3px 0;
    }

    /* Misc */
    hr {
        border: none;
        border-top: 1px solid #ddd;
        margin: 20px 0;
    }
    a { color: #2980b9; text-decoration: none; }
    p { margin: 6px 0; }
    strong { color: #1a1a1a; }
    del { text-decoration: line-through; color: #999; }
    code {
        font-family: 'Courier New', monospace;
        font-size: 9pt;
        background: #f4f4f4;
        padding: 1px 4px;
        border-radius: 2px;
    }
    pre {
        background: #f4f4f4;
        border: 1px solid #ddd;
        border-radius: 3px;
        padding: 10px;
        font-size: 8.5pt;
        overflow-x: auto;
        page-break-inside: avoid;
    }'''

    document = f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title}</title>
<style>
{stylesheet}
</style>
</head>
<body>

<div class="cover">
    <h1>Audit&Fix</h1>
    <p class="subtitle">Business Plan</p>
    <div class="meta">
        <p>AI-Powered Website Conversion Rate Optimization</p>
        <p>March 2026</p>
        <p>Sole Trader &mdash; NSW, Australia</p>
    </div>
</div>

{toc_html}

<div class="content">
{body_html}
</div>

</body>
</html>'''
    return document
355  
356  
def main():
    """Command-line entry point: convert a Markdown file to a PDF.

    Expects exactly two arguments, the input Markdown path and the output
    PDF path. The assembled HTML is also written next to the PDF as
    ``<output path without extension>-print.html`` for debugging, then
    WeasyPrint renders the PDF.

    Exits with status 1 when the argument count is wrong, the input file
    does not exist, or WeasyPrint is not installed.
    """
    import os

    if len(sys.argv) != 3:
        print("Usage: python3 export-to-pdf.py input.md output.pdf")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            md_content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)

    print("Extracting headings...")
    headings = extract_headings(md_content)

    print("Converting markdown to HTML...")
    html_body = markdown_to_html(md_content)
    toc_html = build_toc_page(headings)

    # Use the first plain-text <h1> of the body as the document title.
    title_match = re.search(r'<h1[^>]*>([^<]+)</h1>', html_body)
    title = title_match.group(1) if title_match else "Audit&Fix Business Plan"

    html_document = create_pdf_html(html_body, toc_html, title)

    # Write intermediate HTML for debugging. Only the final extension is
    # swapped: str.replace('.pdf', ...) would rewrite every ".pdf" in the
    # path and would leave the path unchanged (equal to the PDF path) when
    # the output name has no lowercase ".pdf" in it.
    html_path = os.path.splitext(output_file)[0] + '-print.html'
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_document)
    print(f"Intermediate HTML: {html_path}")

    print("Generating PDF with WeasyPrint...")
    # Imported lazily so the HTML export above works without WeasyPrint.
    try:
        from weasyprint import HTML
    except ImportError:
        print("Error: WeasyPrint is not installed (pip install weasyprint)")
        sys.exit(1)
    HTML(string=html_document).write_pdf(output_file)

    size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"PDF created: {output_file} ({size_mb:.1f} MB)")
397  
398  
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()