export-to-pdf.py
#!/usr/bin/env python3
"""
Business Plan Markdown to PDF Converter

Uses the existing export-to-html.py logic but with print-optimized CSS,
then converts to PDF via WeasyPrint.

Usage:
    python3 export-to-pdf.py auditandfix-business-plan.md auditandfix-business-plan.pdf
"""

import os
import re
import sys


# Shared by extract_headings() and markdown_to_html() so the TOC and the body
# always agree on which lines count as headings.
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_HEADING_RE = re.compile(r'^(#{1,4})\s+(.+)$', re.MULTILINE)

# Heading texts that never get a TOC entry (the inline TOC headings themselves).
_TOC_SKIP = frozenset({'Contents', 'Table of Contents'})

# Print stylesheet. Kept as a plain (non-f) string so CSS braces need no escaping.
_PRINT_CSS = '''\
        @page {
            size: A4;
            margin: 20mm 18mm 25mm 18mm;
            @bottom-center {
                content: counter(page);
                font-size: 9pt;
                color: #888;
            }
        }
        @page :first {
            @bottom-center { content: none; }
        }
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            font-size: 10pt;
            line-height: 1.5;
            color: #222;
            margin: 0;
            padding: 0;
        }

        /* Cover page */
        .cover {
            page-break-after: always;
            text-align: center;
            padding-top: 120px;
        }
        .cover h1 {
            font-size: 32pt;
            color: #1a5276;
            border: none;
            margin-bottom: 20px;
        }
        .cover .subtitle {
            font-size: 14pt;
            color: #555;
            margin-bottom: 40px;
        }
        .cover .meta {
            font-size: 11pt;
            color: #777;
            margin-top: 60px;
        }
        .cover .meta p {
            margin: 6px 0;
        }

        /* TOC */
        .toc-page {
            page-break-after: always;
        }
        .toc-page h1, h1.toc-title {
            font-size: 18pt;
            color: #1a5276;
            border-bottom: 2px solid #1a5276;
            padding-bottom: 8px;
            page-break-before: avoid;
        }
        ul.toc {
            list-style: none;
            padding: 0;
        }
        ul.toc li {
            padding: 4px 0;
            border-bottom: 1px dotted #ccc;
        }
        ul.toc li a {
            color: #333;
            text-decoration: none;
            display: block;
        }
        ul.toc li a::after {
            content: target-counter(attr(href url), page);
            float: right;
            color: #888;
        }
        .toc-l1 { font-size: 12pt; font-weight: 700; padding-left: 0; margin-top: 10px; }
        .toc-l2 { font-size: 11pt; font-weight: 600; padding-left: 15px; }
        .toc-l3 { font-size: 10pt; font-weight: 400; padding-left: 30px; }

        /* Headings */
        h1 {
            font-size: 20pt;
            color: #1a5276;
            border-bottom: 2px solid #1a5276;
            padding-bottom: 6px;
            margin-top: 30px;
            page-break-after: avoid;
        }
        h2 {
            font-size: 16pt;
            color: #2c3e50;
            border-bottom: 1px solid #bdc3c7;
            padding-bottom: 4px;
            margin-top: 24px;
            page-break-before: always;
            page-break-after: avoid;
        }
        /* Don't break before the very first h2 (date line) or the financial spreadsheets sub-h2s */
        h2:first-of-type { page-break-before: auto; }
        h3 {
            font-size: 13pt;
            color: #34495e;
            margin-top: 18px;
            page-break-after: avoid;
        }
        h4 {
            font-size: 11pt;
            color: #555;
            margin-top: 14px;
            page-break-after: avoid;
        }

        /* Tables */
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 12px 0;
            font-size: 8.5pt;
            page-break-inside: auto;
        }
        thead { display: table-header-group; }
        tr { page-break-inside: avoid; }
        th, td {
            border: 1px solid #ccc;
            padding: 5px 7px;
            text-align: left;
            vertical-align: top;
        }
        th {
            background-color: #1a5276;
            color: white;
            font-weight: 600;
        }
        tr:nth-child(even) {
            background-color: #f7f9fc;
        }

        /* Lists */
        ul {
            margin: 8px 0;
            padding-left: 22px;
        }
        li {
            margin: 3px 0;
        }

        /* Misc */
        hr {
            border: none;
            border-top: 1px solid #ddd;
            margin: 20px 0;
        }
        a { color: #2980b9; text-decoration: none; }
        p { margin: 6px 0; }
        strong { color: #1a1a1a; }
        del { text-decoration: line-through; color: #999; }
        code {
            font-family: 'Courier New', monospace;
            font-size: 9pt;
            background: #f4f4f4;
            padding: 1px 4px;
            border-radius: 2px;
        }
        pre {
            background: #f4f4f4;
            border: 1px solid #ddd;
            border-radius: 3px;
            padding: 10px;
            font-size: 8.5pt;
            overflow-x: auto;
            page-break-inside: avoid;
        }'''


def slugify(text: str) -> str:
    """Return an anchor-safe slug for *text*.

    HTML tags are removed, the text is lower-cased, every character that is
    not a word character, whitespace or hyphen is dropped, and runs of
    whitespace/underscores/hyphens collapse to a single hyphen.
    """
    text = re.sub(r'<[^>]+>', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'[\s_]+', '-', text)
    text = re.sub(r'-+', '-', text)
    return text.strip('-')


def extract_headings(md_content: str) -> list:
    """Collect the level 1-4 Markdown headings used to build the TOC.

    Returns a list of ``{'level': int, 'text': str, 'slug': str}`` dicts in
    document order. The date heading and the inline TOC headings are skipped.
    """
    # Drop HTML comments first, exactly as markdown_to_html() does; otherwise a
    # commented-out heading would get a TOC entry with no matching anchor.
    md_content = _HTML_COMMENT_RE.sub('', md_content)

    headings = []
    for match in _HEADING_RE.finditer(md_content):
        level = len(match.group(1))
        # Strip whitespace BEFORE the asterisks: with a trailing space the
        # closing '**' of a bold heading would otherwise survive.
        text = match.group(2).strip().strip('*').strip()
        # Skip date line, inline TOC heading
        if text.startswith('Date:') or text in _TOC_SKIP:
            continue
        headings.append({'level': level, 'text': text, 'slug': slugify(text)})
    return headings


def _heading_to_html(match):
    """Regex callback: turn one Markdown heading into ``<hN id="slug">``."""
    level = len(match.group(1))
    text = match.group(2).strip()
    return f'<h{level} id="{slugify(text)}">{text}</h{level}>'


def _table_to_html(match):
    """Regex callback: turn one pipe table (header, separator, rows) into HTML."""
    # group(1) is the header line without its outer pipes. Empty cells are
    # kept so every column stays under its own header.
    header_cells = [cell.strip() for cell in match.group(1).split('|')]
    header_row = '<tr>' + ''.join(f'<th>{cell}</th>' for cell in header_cells) + '</tr>'

    body_rows = []
    for row in match.group(2).strip().split('\n'):
        row = row.strip()
        # Remove exactly one outer pipe on each side (str.strip('|') would
        # also swallow adjacent empty cells).
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        cells = [cell.strip() for cell in row.split('|')]
        if any(cells):  # rows with no content at all are still skipped
            body_rows.append('<tr>' + ''.join(f'<td>{cell}</td>' for cell in cells) + '</tr>')

    return (
        f'<table>\n<thead>{header_row}</thead>\n'
        f'<tbody>{"".join(body_rows)}</tbody>\n</table>'
    )


def _wrap_paragraphs(html: str) -> str:
    """Wrap runs of plain-text lines in ``<p>``/``</p>`` lines.

    Blank lines, lines that already start with a tag, and lines containing a
    pipe (unconverted table fragments) end the current paragraph and are
    passed through unchanged.
    """
    result = []
    in_paragraph = False
    for line in html.split('\n'):
        stripped = line.strip()
        is_plain_text = bool(stripped) and not (stripped.startswith('<') or '|' in stripped)
        if is_plain_text:
            if not in_paragraph:
                result.append('<p>')
                in_paragraph = True
            result.append(line)
            continue
        if in_paragraph:
            result.append('</p>')
            in_paragraph = False
        result.append(line if stripped else '')
    if in_paragraph:
        result.append('</p>')
    return '\n'.join(result)


def markdown_to_html(md_content: str) -> str:
    """Convert the business-plan Markdown to an HTML fragment.

    This is a small regex-based converter, not a full Markdown parser. The
    order of the passes matters (e.g. ``**bold**`` must run before ``*em*``,
    horizontal rules before bullet lists).
    """
    html = md_content

    # Remove HTML comments (TODOs etc)
    html = _HTML_COMMENT_RE.sub('', html)

    # Remove the date heading line (rendered on cover page instead)
    html = re.sub(r'^## \*\*Date:\*\*.*$', '', html, flags=re.MULTILINE)

    # Remove all inline TOC sections (## Contents / ## Table of Contents + list of links)
    # Matches heading followed by numbered or bullet links until first "---"
    html = re.sub(
        r'^## (?:Contents|Table of Contents)\n(?:\n|\d+\.\s+\[.*\]\(.*\)\n?|- \[.*\]\(.*\)\n?)*\n*---',
        '<hr>',
        html,
        flags=re.MULTILINE
    )

    # Convert headers with slugified IDs (single pass, same regex as the TOC)
    html = _HEADING_RE.sub(_heading_to_html, html)

    # Convert tables
    html = re.sub(
        r'\|(.+)\|\n\|[-:\| ]+\|\n((?:\|.+\|\n?)+)',
        _table_to_html,
        html,
        flags=re.MULTILINE
    )

    # Strikethrough ~~text~~
    html = re.sub(r'~~(.+?)~~', r'<del>\1</del>', html)

    # Bold and italic (double-star first, then single-star, then underscores)
    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
    html = re.sub(r'\*(.+?)\*', r'<em>\1</em>', html)
    html = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'<em>\1</em>', html)

    # Links
    html = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'<a href="\2">\1</a>', html)

    # Horizontal rules
    html = re.sub(r'^---+$', '<hr>', html, flags=re.MULTILINE)

    # Numbered lists
    html = re.sub(r'^\d+\.\s+(.+)$', r'<li>\1</li>', html, flags=re.MULTILINE)

    # Unordered lists (with indentation support)
    html = re.sub(r'^\s*[\-\*]\s+(.+)$', r'<li>\1</li>', html, flags=re.MULTILINE)

    # Wrap consecutive <li> in <ul>
    # NOTE: numbered items end up in the same <ul> and therefore render as bullets.
    html = re.sub(r'((?:<li>.*?</li>\n?)+)', r'<ul>\n\1</ul>\n', html)

    # Convert paragraphs
    return _wrap_paragraphs(html)


def build_toc_page(headings: list) -> str:
    """Build a print-friendly TOC with page numbers (via target-counter).

    Only headings of level 1-3 are listed; each entry links to the heading's
    slug so the stylesheet can resolve the page number.
    """
    toc = [
        '<div class="toc-page">',
        '<h1 class="toc-title">Contents</h1>',
        '<ul class="toc">',
    ]
    toc.extend(
        f'<li class="toc-l{h["level"]}">'
        f'<a href="#{h["slug"]}">{h["text"]}</a>'
        f'</li>'
        for h in headings
        if h['level'] <= 3
    )
    toc.append('</ul>')
    toc.append('</div>')
    return '\n'.join(toc)


def create_pdf_html(body_html: str, toc_html: str, title: str = "Audit&Fix Business Plan") -> str:
    """Assemble the full print document: cover page, TOC page and content.

    *body_html* and *toc_html* are inserted verbatim; *title* is only used
    for the ``<title>`` element.
    """
    return f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>{title}</title>
    <style>
{_PRINT_CSS}
    </style>
</head>
<body>

<div class="cover">
    <h1>Audit&Fix</h1>
    <p class="subtitle">Business Plan</p>
    <div class="meta">
        <p>AI-Powered Website Conversion Rate Optimization</p>
        <p>March 2026</p>
        <p>Sole Trader — NSW, Australia</p>
    </div>
</div>

{toc_html}

<div class="content">
{body_html}
</div>

</body>
</html>'''


def derive_html_path(output_file: str) -> str:
    """Return the path of the intermediate debug HTML for *output_file*.

    Only the final extension is replaced, so a ``.pdf`` elsewhere in the path
    is left alone and an output name without a ``.pdf`` suffix can never
    collide with the PDF itself.
    """
    root, _ext = os.path.splitext(output_file)
    return root + '-print.html'


def main():
    """Command-line entry point: ``export-to-pdf.py input.md output.pdf``."""
    if len(sys.argv) != 3:
        print("Usage: python3 export-to-pdf.py input.md output.pdf")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            md_content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)

    print("Extracting headings...")
    headings = extract_headings(md_content)

    print("Converting markdown to HTML...")
    html_body = markdown_to_html(md_content)
    toc_html = build_toc_page(headings)

    # Use the first <h1> of the document as the title, if there is one.
    title_match = re.search(r'<h1[^>]*>([^<]+)</h1>', html_body)
    title = title_match.group(1) if title_match else "Audit&Fix Business Plan"

    html_document = create_pdf_html(html_body, toc_html, title)

    # Write intermediate HTML for debugging
    html_path = derive_html_path(output_file)
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_document)
    print(f"Intermediate HTML: {html_path}")

    print("Generating PDF with WeasyPrint...")
    # Imported lazily so the conversion helpers stay importable without WeasyPrint.
    from weasyprint import HTML
    HTML(string=html_document).write_pdf(output_file)

    size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"PDF created: {output_file} ({size_mb:.1f} MB)")


if __name__ == '__main__':
    main()