web.py
"""ppp-web: Fetch a webpage and render it as a print-optimized PDF."""

import argparse
import base64
import html
import os
import re
import shutil
import subprocess
import sys
import tempfile
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup, Comment
from readability import Document
from weasyprint import HTML

from ppp._util import get_data_path


# ---------------------------------------------------------------------------
# Content extraction
# ---------------------------------------------------------------------------

# Browser-like User-Agent sent with every outgoing HTTP request (page and
# SVG downloads alike) so both kinds of request look the same to the server.
_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}


def fetch_page(url):
    """Fetch a webpage and return its HTML as text.

    Args:
        url: Absolute http(s) URL of the page.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection errors, timeouts (30 s) or
            a non-2xx status (via ``raise_for_status``).
    """
    resp = requests.get(url, headers=_REQUEST_HEADERS, timeout=30)
    resp.raise_for_status()
    # requests sometimes guesses wrong encoding (e.g. ISO-8859-1 when the
    # page is actually UTF-8). If the server didn't declare a charset in the
    # Content-Type header, force UTF-8 which is correct for virtually all
    # modern websites. Header parameter names are case-insensitive, so
    # compare in lower case.
    if "charset" not in resp.headers.get("Content-Type", "").lower():
        resp.encoding = "utf-8"
    return resp.text


def extract_content(raw_html, url):
    """Use readability to pull out the main article content.

    Args:
        raw_html: Full HTML text of the page.
        url: URL the page was fetched from; passed through to readability.

    Returns:
        A ``(title, article_html)`` tuple, where ``article_html`` is the
        partial-HTML summary produced by ``Document.summary``.
    """
    doc = Document(raw_html, url=url)
    title = doc.title()
    article_html = doc.summary(html_partial=True)
    return title, article_html


# ---------------------------------------------------------------------------
# HTML cleanup
# ---------------------------------------------------------------------------

_HAS_INKSCAPE = shutil.which("inkscape") is not None

# First <svg ...> opening tag in a document.
_SVG_ROOT_RE = re.compile(r"<svg[^>]*>")
# A whitespace-preceded width=/height= attribute with a single- or
# double-quoted value. The leading \s+ keeps names such as "stroke-width"
# from matching.
_SVG_DIM_ATTR_RE = re.compile(
    r"""\s+(?:width|height)\s*=\s*(?:"[^"]*"|'[^']*')"""
)
_SVG_NAMESPACE = "http://www.w3.org/2000/svg"


def _rasterize_svg(svg_bytes, output_width=1500):
    """Rasterize SVG bytes to a high-res PNG via Inkscape.

    Inkscape handles complex SVG features (masks, filters, external fonts)
    far better than Cairo-based renderers.

    Args:
        svg_bytes: Raw SVG document.
        output_width: Width in pixels passed to ``--export-width``.

    Returns:
        A ``data:image/png;base64,...`` URI, or None if Inkscape is not
        installed or rasterization fails for any reason (best-effort).
    """
    if not _HAS_INKSCAPE:
        return None
    svg_path = None
    png_path = None
    try:
        # delete=False: Inkscape must be able to open the file by name after
        # we have closed it; the finally block below removes it.
        with tempfile.NamedTemporaryFile(suffix=".svg", delete=False) as tmp_svg:
            svg_path = tmp_svg.name
            tmp_svg.write(svg_bytes)
        # Swap only the extension; a plain str.replace would also rewrite
        # any ".svg" occurring earlier in the temp directory path.
        png_path = os.path.splitext(svg_path)[0] + ".png"
        subprocess.run(
            ["inkscape", svg_path,
             "--export-type=png",
             f"--export-filename={png_path}",
             f"--export-width={output_width}"],
            capture_output=True, timeout=30,
        )
        # The exit status is not checked; a non-empty output file is the
        # success criterion.
        if os.path.isfile(png_path) and os.path.getsize(png_path) > 0:
            with open(png_path, "rb") as f:
                encoded = base64.b64encode(f.read()).decode("ascii")
            return f"data:image/png;base64,{encoded}"
    except Exception:
        # Deliberately best-effort: any failure means "no raster available"
        # and the caller falls back to embedding the SVG itself.
        pass
    finally:
        for path in (svg_path, png_path):
            if path and os.path.exists(path):
                try:
                    os.unlink(path)
                except OSError:
                    # A leftover temp file must not turn a best-effort
                    # helper into a crash.
                    pass
    return None


def _make_svg_scalable(svg_bytes):
    """Strip fixed width/height from an SVG so it scales to its container.

    Only the first ``<svg ...>`` opening tag is modified: its ``width`` and
    ``height`` attributes are removed and ``width="100%"`` is inserted.
    Other attributes, including any viewBox, are left untouched so the
    aspect ratio is maintained.

    Args:
        svg_bytes: SVG document as bytes (decoded as UTF-8, with undecodable
            bytes replaced).

    Returns:
        The modified SVG, UTF-8 encoded.
    """
    svg_text = svg_bytes.decode("utf-8", errors="replace")

    def strip_root_dims(match):
        tag = _SVG_DIM_ATTR_RE.sub("", match.group(0))
        return tag.replace("<svg", '<svg width="100%"', 1)

    svg_text = _SVG_ROOT_RE.sub(strip_root_dims, svg_text, count=1)
    return svg_text.encode("utf-8")


def _fetch_and_encode_svg(svg_url):
    """Download an SVG and return a data URI (PNG raster or scalable SVG).

    Args:
        svg_url: Absolute URL of the SVG file.

    Returns:
        A ``data:image/png`` URI when Inkscape rasterization succeeds,
        otherwise a ``data:image/svg+xml`` URI of the scalable SVG. If the
        download itself fails, ``svg_url`` is returned unchanged so the
        renderer can still try to load it.
    """
    try:
        resp = requests.get(svg_url, headers=_REQUEST_HEADERS, timeout=15)
        resp.raise_for_status()
    except requests.RequestException:
        return svg_url
    svg_bytes = resp.content

    # Try Inkscape rasterization first (handles masks, filters, etc.)
    png_uri = _rasterize_svg(svg_bytes)
    if png_uri:
        return png_uri

    # Fallback: embed as scalable SVG data URI
    svg_bytes = _make_svg_scalable(svg_bytes)
    encoded = base64.b64encode(svg_bytes).decode("ascii")
    return f"data:image/svg+xml;base64,{encoded}"


def clean_html(article_html, url):
    """Normalize the extracted HTML for print rendering.

    Removes comments and interactive/non-content elements, converts SVG
    ``<object>`` and inline ``<svg>`` elements to ``<img>`` tags, makes
    image and link URLs absolute, and drops empty paragraphs.

    Args:
        article_html: HTML fragment produced by ``extract_content``.
        url: Page URL, used as the base for resolving relative URLs.

    Returns:
        The cleaned ``BeautifulSoup`` tree.
    """
    soup = BeautifulSoup(article_html, "lxml")

    # Remove comments
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()

    # Remove scripts, styles, iframes, forms, navigation and form controls
    for tag in soup.find_all(["script", "style", "iframe", "form", "nav",
                              "button", "input"]):
        tag.decompose()

    # Convert <object type="image/svg+xml"> to <img> tags.
    # WeasyPrint renders <img src="*.svg"> reliably but not <object>.
    for obj in soup.find_all("object"):
        obj_type = (obj.get("type") or "").lower()
        data_url = obj.get("data")
        if "svg" in obj_type and data_url:
            if data_url.startswith(("http://", "https://")):
                abs_url = data_url
            else:
                abs_url = urljoin(url, data_url)
            print(f"  Fetching SVG: {abs_url}")
            src = _fetch_and_encode_svg(abs_url)
            img_tag = soup.new_tag("img", src=src)
            img_tag["alt"] = obj.get("aria-label", "diagram")
            obj.replace_with(img_tag)
        else:
            obj.decompose()

    # Convert any inline <svg> elements to data-URI <img> tags so WeasyPrint
    # can render them with proper sizing constraints.
    for svg_tag in soup.find_all("svg"):
        # Inline SVG in HTML needs no namespace declaration, but once it is
        # serialized into a standalone image document the root element must
        # declare the SVG namespace to be recognised as SVG.
        if not svg_tag.get("xmlns"):
            svg_tag["xmlns"] = _SVG_NAMESPACE
        # NOTE(review): HTML parsers typically lower-case attribute names,
        # while XML is case-sensitive; restore the root viewBox so the
        # aspect ratio survives. Other camelCase SVG names are not repaired.
        if "viewbox" in svg_tag.attrs and "viewBox" not in svg_tag.attrs:
            svg_tag["viewBox"] = svg_tag.attrs.pop("viewbox")
        svg_html = str(svg_tag)
        svg_scalable = _make_svg_scalable(svg_html.encode("utf-8"))
        encoded = base64.b64encode(svg_scalable).decode("ascii")
        src = f"data:image/svg+xml;base64,{encoded}"
        img_tag = soup.new_tag("img", src=src)
        img_tag["alt"] = "diagram"
        svg_tag.replace_with(img_tag)

    # Resolve relative URLs for images and links
    for img in soup.find_all("img"):
        src = img.get("src")
        if src and not src.startswith(("http://", "https://", "data:")):
            img["src"] = urljoin(url, src)

    for a in soup.find_all("a"):
        href = a.get("href")
        if href and not href.startswith(("http://", "https://", "mailto:", "#")):
            a["href"] = urljoin(url, href)

    # Remove empty paragraphs (no text and no image inside)
    for p in soup.find_all("p"):
        if not p.get_text(strip=True) and not p.find("img"):
            p.decompose()

    return soup


# ---------------------------------------------------------------------------
# Document assembly
# ---------------------------------------------------------------------------

def build_document(title, content_soup, url, css_path):
    """Wrap extracted content in a full HTML document with print CSS.

    Args:
        title: Article title (plain text; HTML-escaped here).
        content_soup: Cleaned ``BeautifulSoup`` tree from ``clean_html``.
        url: Source URL, shown on the title block (HTML-escaped here).
        css_path: Path to the stylesheet that is inlined into ``<style>``.

    Returns:
        The complete HTML document as a string.

    Raises:
        OSError: If the stylesheet cannot be read.
    """
    with open(css_path, "r", encoding="utf-8") as f:
        css = f.read()

    # Title, domain and URL are plain text from an untrusted page; escape
    # them so characters like "<" or "&" cannot break the markup.
    safe_title = html.escape(title or "")
    safe_domain = html.escape(urlparse(url).netloc)
    safe_url = html.escape(url)

    # Get the inner HTML from the content soup
    # (content_soup may be a full document from lxml; grab the body contents)
    body_tag = content_soup.find("body")
    if body_tag:
        inner = "".join(str(child) for child in body_tag.children)
    else:
        inner = str(content_soup)

    document = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
<style>
{css}
</style>
</head>
<body>

<!-- Running strings for page headers -->
<span class="ppp-title">{safe_title}</span>
<span class="ppp-source">{safe_domain}</span>

<!-- Title page -->
<div class="ppp-header">
  <h1>{safe_title}</h1>
  <div class="ppp-meta">
    Source: {safe_url}<br>
    Formatted for print by PPP
  </div>
</div>

<!-- Content -->
<article>
{inner}
</article>

</body>
</html>"""

    return document


# ---------------------------------------------------------------------------
# PDF rendering
# ---------------------------------------------------------------------------

def render_pdf(html_string, output_path, base_url=None):
    """Render an HTML string to PDF via WeasyPrint.

    Args:
        html_string: Complete HTML document.
        output_path: Destination path of the PDF.
        base_url: Base URL handed to WeasyPrint together with the document.
    """
    html_doc = HTML(string=html_string, base_url=base_url)
    html_doc.write_pdf(output_path)


def convert_to_grayscale(input_path, output_path):
    """Convert a PDF to grayscale using Ghostscript.

    Ghostscript writes to ``<output_path>.gray.tmp``, which then replaces
    ``output_path``; this makes it safe to pass the same file as input and
    output.

    Args:
        input_path: Path of the source PDF.
        output_path: Path of the grayscale PDF.

    Raises:
        subprocess.CalledProcessError: If ``gs`` exits with a non-zero status.
        subprocess.TimeoutExpired: If ``gs`` runs longer than 120 seconds.
        OSError: If ``gs`` cannot be started or the result cannot be moved.
    """
    tmp_out = output_path + ".gray.tmp"
    try:
        subprocess.run(
            ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-dQUIET",
             "-sColorConversionStrategy=Gray",
             "-dProcessColorModel=/DeviceGray",
             f"-sOutputFile={tmp_out}", input_path],
            check=True, timeout=120,
        )
        os.replace(tmp_out, output_path)
    finally:
        # After a successful os.replace the temp file no longer exists; on
        # failure this removes the partial output instead of leaving it.
        if os.path.exists(tmp_out):
            os.unlink(tmp_out)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def default_output_name(url):
    """Derive a reasonable output filename from the URL.

    Uses the URL path (or the host when the path is empty), turns runs of
    slashes, dots and whitespace into hyphens, drops every other character
    outside ``[a-zA-Z0-9-]`` and truncates to 60 characters.

    Args:
        url: Page URL.

    Returns:
        A filename of the form ``PPP-<name>.pdf``; ``<name>`` is ``output``
        when nothing usable remains.
    """
    parsed = urlparse(url)
    # Use the path, stripping leading/trailing slashes
    path = parsed.path.strip("/")
    if not path:
        path = parsed.netloc
    # Replace slashes and dots with hyphens, keep it clean
    name = re.sub(r"[/\.\s]+", "-", path)
    name = re.sub(r"[^a-zA-Z0-9\-]", "", name)
    name = name.strip("-")[:60] or "output"
    return f"PPP-{name}.pdf"


# Maps the --paper CLI choice to the value written into the CSS "size:" rule.
PAPER_SIZES = {
    "letter": "letter",
    "a4": "A4",
    "a5": "A5",
    "legal": "legal",
}


def main():
    """Command-line entry point: fetch, extract, clean, assemble, render.

    Exits with status 1 when the stylesheet cannot be located, the page
    cannot be fetched, or the optional grayscale conversion fails.
    """
    parser = argparse.ArgumentParser(
        prog="ppp-web",
        description="Fetch a webpage and render it as a print-optimized PDF.",
    )
    parser.add_argument("url", help="URL of the webpage to format")
    parser.add_argument("-o", "--output", help="Output PDF filename")
    parser.add_argument(
        "--paper", choices=list(PAPER_SIZES.keys()), default="letter",
        help="Paper size (default: letter)",
    )
    parser.add_argument(
        "--font-size", type=float, default=11,
        help="Base font size in points (default: 11)",
    )
    parser.add_argument(
        "--no-title-page", action="store_true",
        help="Skip the title page",
    )
    parser.add_argument(
        "--grayscale", action="store_true",
        help="Convert output to grayscale (saves ink)",
    )

    args = parser.parse_args()

    # Ensure URL has a scheme
    url = args.url
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    output = args.output or default_output_name(url)

    # Locate the print stylesheet
    css_path = get_data_path("print.css")
    if css_path is None:
        print("Error: could not locate print.css stylesheet", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching {url} ...")
    try:
        raw_html = fetch_page(url)
    except requests.RequestException as exc:
        # Report network/HTTP failures as a message rather than a traceback.
        print(f"Error: could not fetch {url}: {exc}", file=sys.stderr)
        sys.exit(1)

    print("Extracting content ...")
    title, article_html = extract_content(raw_html, url)
    print(f"  Title: {title}")

    print("Cleaning up ...")
    content_soup = clean_html(article_html, url)

    print("Building document ...")
    # Read CSS and apply overrides
    with open(css_path, "r", encoding="utf-8") as f:
        css_text = f.read()

    # Apply paper size override (textual substitution on the stylesheet's
    # "size: letter;" declaration)
    if args.paper != "letter":
        css_text = css_text.replace(
            "size: letter;", f"size: {PAPER_SIZES[args.paper]};"
        )

    # Apply font size override (same approach, on "font-size: 11pt;")
    if args.font_size != 11:
        css_text = css_text.replace(
            "font-size: 11pt;", f"font-size: {args.font_size}pt;"
        )

    # Write a temp CSS with overrides so build_document can read it.
    # delete=False because build_document reopens the file by name.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".css", delete=False, encoding="utf-8"
    ) as tmp_css:
        tmp_css.write(css_text)

    try:
        html_doc = build_document(title, content_soup, url, tmp_css.name)

        # Strip title page if requested
        if args.no_title_page:
            html_doc = html_doc.replace(
                'class="ppp-header"', 'class="ppp-header" style="display:none"'
            )

        print("Rendering PDF ...")
        render_pdf(html_doc, output, base_url=url)
    finally:
        os.unlink(tmp_css.name)

    if args.grayscale:
        print("Converting to grayscale ...")
        try:
            convert_to_grayscale(output, output)
        except (OSError, subprocess.SubprocessError) as exc:
            # The colour PDF has already been written to `output`.
            print(f"Error: grayscale conversion failed: {exc}", file=sys.stderr)
            sys.exit(1)

    print(f"Done! Output: {output}")


if __name__ == "__main__":
    main()