Cradicle Explorer

/ src / ppp / web.py
web.py
  1  """ppp-web: Fetch a webpage and render it as a print-optimized PDF."""
  2  
  3  import argparse
  4  import base64
  5  import os
  6  import re
  7  import shutil
  8  import subprocess
  9  import sys
 10  import tempfile
 11  from urllib.parse import urljoin, urlparse
 12  
 13  import requests
 14  from bs4 import BeautifulSoup, Comment
 15  from readability import Document
 16  from weasyprint import HTML
 17  
 18  from ppp._util import get_data_path
 19  
 20  
 21  # ---------------------------------------------------------------------------
 22  # Content extraction
 23  # ---------------------------------------------------------------------------
 24  
 25  def fetch_page(url):
 26      """Fetch a webpage and return the raw HTML."""
 27      headers = {
 28          "User-Agent": (
 29              "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
 30              "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
 31          )
 32      }
 33      resp = requests.get(url, headers=headers, timeout=30)
 34      resp.raise_for_status()
 35      # requests sometimes guesses wrong encoding (e.g. ISO-8859-1 when the
 36      # page is actually UTF-8).  If the server didn't declare a charset in the
 37      # Content-Type header, force UTF-8 which is correct for virtually all
 38      # modern websites.
 39      if "charset" not in resp.headers.get("Content-Type", ""):
 40          resp.encoding = "utf-8"
 41      return resp.text
 42  
 43  
 44  def extract_content(raw_html, url):
 45      """Use readability to pull out the main article content.
 46  
 47      Returns (title, article_html).
 48      """
 49      doc = Document(raw_html, url=url)
 50      title = doc.title()
 51      article_html = doc.summary(html_partial=True)
 52      return title, article_html
 53  
 54  
 55  # ---------------------------------------------------------------------------
 56  # HTML cleanup
 57  # ---------------------------------------------------------------------------
 58  
 59  _HAS_INKSCAPE = shutil.which("inkscape") is not None
 60  
 61  
 62  def _rasterize_svg(svg_bytes, output_width=1500):
 63      """Rasterize SVG bytes to a high-res PNG via Inkscape.
 64  
 65      Returns a data:image/png;base64,... URI, or None if rasterization fails.
 66      Inkscape handles complex SVG features (masks, filters, external fonts)
 67      far better than Cairo-based renderers.
 68      """
 69      if not _HAS_INKSCAPE:
 70          return None
 71      tmp_svg = None
 72      tmp_png = None
 73      try:
 74          tmp_svg = tempfile.NamedTemporaryFile(suffix=".svg", delete=False)
 75          tmp_svg.write(svg_bytes)
 76          tmp_svg.close()
 77          tmp_png = tmp_svg.name.replace(".svg", ".png")
 78          subprocess.run(
 79              ["inkscape", tmp_svg.name,
 80               "--export-type=png",
 81               f"--export-filename={tmp_png}",
 82               f"--export-width={output_width}"],
 83              capture_output=True, timeout=30,
 84          )
 85          if os.path.isfile(tmp_png) and os.path.getsize(tmp_png) > 0:
 86              with open(tmp_png, "rb") as f:
 87                  encoded = base64.b64encode(f.read()).decode("ascii")
 88              return f"data:image/png;base64,{encoded}"
 89      except Exception:
 90          pass
 91      finally:
 92          for p in (tmp_svg and tmp_svg.name, tmp_png):
 93              if p and os.path.exists(p):
 94                  os.unlink(p)
 95      return None
 96  
 97  
 98  def _make_svg_scalable(svg_bytes):
 99      """Strip fixed width/height from an SVG so it scales to its container.
100  
101      Preserves the viewBox so the aspect ratio is maintained.
102      """
103      svg_text = svg_bytes.decode("utf-8", errors="replace")
104      def strip_root_dims(m):
105          tag = m.group(0)
106          tag = re.sub(r'\s+width\s*=\s*"[^"]*"', '', tag)
107          tag = re.sub(r"\s+width\s*=\s*'[^']*'", '', tag)
108          tag = re.sub(r'\s+height\s*=\s*"[^"]*"', '', tag)
109          tag = re.sub(r"\s+height\s*=\s*'[^']*'", '', tag)
110          tag = tag.replace("<svg", '<svg width="100%"', 1)
111          return tag
112      svg_text = re.sub(r'<svg[^>]*>', strip_root_dims, svg_text, count=1)
113      return svg_text.encode("utf-8")
114  
115  
def _fetch_and_encode_svg(svg_url):
    """Download an SVG and turn it into an embeddable data URI.

    Returns a PNG data URI when Inkscape rasterization succeeds, otherwise
    a base64 SVG data URI of the scalable version.  If anything goes wrong
    (download or encoding), the original ``svg_url`` is returned unchanged.
    """
    try:
        response = requests.get(svg_url, timeout=15)
        response.raise_for_status()
        payload = response.content

        # Preferred: Inkscape raster (handles masks, filters, etc.)
        data_uri = _rasterize_svg(payload)
        if not data_uri:
            # Fallback: embed the SVG itself, made scalable
            b64 = base64.b64encode(_make_svg_scalable(payload)).decode("ascii")
            data_uri = f"data:image/svg+xml;base64,{b64}"
        return data_uri
    except Exception:
        return svg_url
134  
135  
def clean_html(article_html, url):
    """Prepare extracted article HTML for print rendering.

    Parses ``article_html`` with the lxml parser, strips comments and
    interactive/non-content elements, turns SVG ``<object>`` and inline
    ``<svg>`` elements into ``<img>`` tags, makes image and link URLs
    absolute relative to ``url``, and drops empty paragraphs.

    Returns the resulting BeautifulSoup tree.
    """
    soup = BeautifulSoup(article_html, "lxml")

    # Drop HTML comments
    for node in soup.find_all(string=lambda text: isinstance(text, Comment)):
        node.extract()

    # Drop scripts, styles, iframes, forms and other non-print elements
    unwanted = ["script", "style", "iframe", "form", "nav", "button", "input"]
    for element in soup.find_all(unwanted):
        element.decompose()

    # Replace <object type="image/svg+xml"> with <img>.
    # WeasyPrint renders <img src="*.svg"> reliably but not <object>.
    web_schemes = ("http://", "https://")
    for obj in soup.find_all("object"):
        mime = (obj.get("type") or "").lower()
        target = obj.get("data")
        if not target or "svg" not in mime:
            # Not an SVG object (or nothing to load): discard it.
            obj.decompose()
            continue
        if target.startswith(web_schemes):
            abs_url = target
        else:
            abs_url = urljoin(url, target)
        print(f"  Fetching SVG: {abs_url}")
        replacement = soup.new_tag("img", src=_fetch_and_encode_svg(abs_url))
        replacement["alt"] = obj.get("aria-label", "diagram")
        obj.replace_with(replacement)

    # Replace inline <svg> elements with data-URI <img> tags so WeasyPrint
    # can render them with proper sizing constraints.
    for inline_svg in soup.find_all("svg"):
        scalable = _make_svg_scalable(str(inline_svg).encode("utf-8"))
        payload = base64.b64encode(scalable).decode("ascii")
        placeholder = soup.new_tag(
            "img", src=f"data:image/svg+xml;base64,{payload}"
        )
        placeholder["alt"] = "diagram"
        inline_svg.replace_with(placeholder)

    # Make relative image sources absolute
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src or src.startswith(("http://", "https://", "data:")):
            continue
        img["src"] = urljoin(url, src)

    # Make relative link targets absolute
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href or href.startswith(("http://", "https://", "mailto:", "#")):
            continue
        anchor["href"] = urljoin(url, href)

    # Drop paragraphs that hold neither text nor an image
    for para in soup.find_all("p"):
        if para.get_text(strip=True) or para.find("img"):
            continue
        para.decompose()

    return soup
192  
193  
194  # ---------------------------------------------------------------------------
195  # Document assembly
196  # ---------------------------------------------------------------------------
197  
def build_document(title, content_soup, url, css_path):
    """Wrap extracted content in a full HTML document with print CSS.

    Args:
        title: Page title; used for ``<title>``, the running header string
            and the title-page heading.
        content_soup: Parsed content tree.  If it contains a ``<body>``,
            only the body's children are embedded; otherwise the whole tree
            is serialized.
        url: Source URL, shown on the title page; its network location is
            used as the running "source" string.
        css_path: Path of the stylesheet file to inline into ``<style>``.

    Returns:
        The complete HTML document as a string.
    """
    # Function-scope import: the module-level namespace has no `html` name,
    # and importing just `escape` avoids introducing one.
    from html import escape

    with open(css_path, "r") as f:
        css = f.read()

    domain = urlparse(url).netloc

    # Get the inner HTML from the content soup
    # (content_soup may be a full document from lxml; grab the body contents)
    body_tag = content_soup.find("body")
    inner = "".join(str(c) for c in body_tag.children) if body_tag else str(content_soup)

    # Title, domain and URL originate from the fetched page / user input and
    # are plain text, so they must be HTML-escaped before interpolation;
    # otherwise a "<" or "&" in a title yields broken (or injected) markup.
    safe_title = escape(str(title))
    safe_domain = escape(domain)
    safe_url = escape(url)

    document = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
<style>
{css}
</style>
</head>
<body>

<!-- Running strings for page headers -->
<span class="ppp-title">{safe_title}</span>
<span class="ppp-source">{safe_domain}</span>

<!-- Title page -->
<div class="ppp-header">
  <h1>{safe_title}</h1>
  <div class="ppp-meta">
    Source: {safe_url}<br>
    Formatted for print by PPP
  </div>
</div>

<!-- Content -->
<article>
{inner}
</article>

</body>
</html>"""

    return document
243  
244  
245  # ---------------------------------------------------------------------------
246  # PDF rendering
247  # ---------------------------------------------------------------------------
248  
def render_pdf(html_string, output_path, base_url=None):
    """Turn an HTML string into a PDF file at ``output_path`` using WeasyPrint.

    ``base_url`` is handed to WeasyPrint's ``HTML`` constructor unchanged.
    """
    HTML(string=html_string, base_url=base_url).write_pdf(output_path)
253  
254  
def convert_to_grayscale(input_path, output_path):
    """Convert a PDF to grayscale using Ghostscript.

    Ghostscript writes to ``<output_path>.gray.tmp``, which is then moved
    over ``output_path``; this allows ``input_path`` and ``output_path`` to
    be the same file.

    Args:
        input_path: Path of the PDF to convert.
        output_path: Path where the grayscale PDF is placed.

    Raises:
        subprocess.CalledProcessError: If ``gs`` exits with a non-zero status.
        subprocess.TimeoutExpired: If ``gs`` runs longer than 120 seconds.
        OSError: If ``gs`` cannot be started or the final move fails.
    """
    tmp_out = output_path + ".gray.tmp"
    try:
        subprocess.run(
            ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-dQUIET",
             "-sColorConversionStrategy=Gray",
             "-dProcessColorModel=/DeviceGray",
             f"-sOutputFile={tmp_out}", input_path],
            check=True, timeout=120,
        )
        os.replace(tmp_out, output_path)
    finally:
        # After a successful os.replace the temp file no longer exists; if
        # Ghostscript failed or timed out, remove whatever partial output it
        # left behind instead of littering the output directory.
        if os.path.exists(tmp_out):
            os.unlink(tmp_out)
266  
267  
268  # ---------------------------------------------------------------------------
269  # CLI
270  # ---------------------------------------------------------------------------
271  
def default_output_name(url):
    """Derive a reasonable output filename from the URL.

    The URL path (or the host name when the path is empty) is turned into a
    hyphen-separated slug of ASCII letters, digits and hyphens, limited to
    60 characters.

    Args:
        url: The page URL.

    Returns:
        A filename of the form ``PPP-<slug>.pdf``; the slug is ``output``
        when nothing usable remains.
    """
    parsed = urlparse(url)
    # Use the path without leading/trailing slashes; fall back to the host.
    path = parsed.path.strip("/") or parsed.netloc
    # Replace slashes, dots and whitespace with hyphens, drop everything else
    name = re.sub(r"[/\.\s]+", "-", path)
    name = re.sub(r"[^a-zA-Z0-9\-]", "", name)
    # Strip again after truncating: the 60-character cut can land right
    # after a hyphen, which would otherwise produce "PPP-...-.pdf".
    name = name.strip("-")[:60].strip("-") or "output"
    return f"PPP-{name}.pdf"
284  
285  
# Maps each accepted --paper choice to the value substituted into the
# stylesheet's "size:" declaration.
PAPER_SIZES = {
    "letter": "letter",
    "a4": "A4",
    "a5": "A5",
    "legal": "legal",
}


def main(argv=None):
    """Command-line entry point: fetch a URL and write a print-ready PDF.

    Pipeline: fetch the page, extract the article, clean the HTML, build the
    document with the (optionally overridden) print stylesheet, render it to
    PDF, and optionally convert the result to grayscale.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``.  ``None``
            (the default) keeps the normal command-line behavior.

    Exits with status 1 if the ``print.css`` stylesheet cannot be located.
    """
    parser = argparse.ArgumentParser(
        prog="ppp-web",
        description="Fetch a webpage and render it as a print-optimized PDF.",
    )
    parser.add_argument("url", help="URL of the webpage to format")
    parser.add_argument("-o", "--output", help="Output PDF filename")
    parser.add_argument(
        "--paper", choices=list(PAPER_SIZES.keys()), default="letter",
        help="Paper size (default: letter)",
    )
    parser.add_argument(
        "--font-size", type=float, default=11,
        help="Base font size in points (default: 11)",
    )
    parser.add_argument(
        "--no-title-page", action="store_true",
        help="Skip the title page",
    )
    parser.add_argument(
        "--grayscale", action="store_true",
        help="Convert output to grayscale (saves ink)",
    )

    args = parser.parse_args(argv)

    # Ensure URL has a scheme
    url = args.url
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    output = args.output or default_output_name(url)

    # Locate the print stylesheet
    css_path = get_data_path("print.css")
    if css_path is None:
        print("Error: could not locate print.css stylesheet", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching {url} ...")
    raw_html = fetch_page(url)

    print("Extracting content ...")
    title, article_html = extract_content(raw_html, url)
    print(f"  Title: {title}")

    print("Cleaning up ...")
    content_soup = clean_html(article_html, url)

    print("Building document ...")
    # Read CSS and apply overrides
    with open(css_path, "r") as f:
        css_text = f.read()

    # Apply paper size override (the stylesheet's default is "letter")
    if args.paper != "letter":
        css_text = css_text.replace("size: letter;", f"size: {PAPER_SIZES[args.paper]};")

    # Apply font size override (the stylesheet's default is 11pt)
    if args.font_size != 11:
        css_text = css_text.replace("font-size: 11pt;", f"font-size: {args.font_size}pt;")

    # build_document reads its stylesheet from a path, so the overridden CSS
    # goes into a temp file.  The file is created inside the try block so it
    # is removed even if writing it fails part-way.
    tmp_css_path = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".css", delete=False
        ) as tmp_css:
            tmp_css_path = tmp_css.name
            tmp_css.write(css_text)

        html_doc = build_document(title, content_soup, url, tmp_css_path)

        # Hide the title page if requested
        if args.no_title_page:
            html_doc = html_doc.replace(
                'class="ppp-header"', 'class="ppp-header" style="display:none"'
            )

        print("Rendering PDF ...")
        render_pdf(html_doc, output, base_url=url)
    finally:
        if tmp_css_path is not None and os.path.exists(tmp_css_path):
            os.unlink(tmp_css_path)

    if args.grayscale:
        print("Converting to grayscale ...")
        # Same path for input and output: the PDF is converted in place.
        convert_to_grayscale(output, output)

    print(f"Done! Output: {output}")


if __name__ == "__main__":
    main()