# integrations/html_to_notion.py
"""Convert LinkedIn HTML job descriptions to Notion rich text blocks."""

__all__ = ["html_to_notion_blocks"]

import re
from typing import Any

from bs4 import (
    BeautifulSoup,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    Tag,
)

_MAX_BLOCKS = 100
# Notion API limit for children blocks in a single pages.create call.

_RICH_TEXT_MAX_LENGTH = 2000
# Notion API limit per rich text element content string.

_LINKEDIN_CONTENT_CLASS = "show-more-less-html__markup"
# CSS class for the inner content div in LinkedIn's show-more-less-html section.

_SKIP_TAGS = frozenset({"button", "img", "script", "style"})
# Tags whose entire subtree should be silently ignored.

_TRANSPARENT_TAGS = frozenset({"section", "div", "span", "header", "footer", "main", "article"})
# Container tags that produce no block of their own — children are processed directly.

_INLINE_TAGS = frozenset(
    {"strong", "b", "em", "i", "a", "span", "code", "sub", "sup", "u", "s", "br"}
)
# Tags that participate in inline (rich text) rendering when encountered at block level.

_BOLD_TAGS = frozenset({"strong", "b"})
# Tags that apply the bold annotation and may be promoted to section headers.

_NON_TEXT_NODES = (Comment, Declaration, Doctype, ProcessingInstruction)
# NavigableString subclasses that hold markup (``<!-- … -->``, ``<!DOCTYPE …>``,
# ``<?…?>``) rather than visible text. They must never reach the output.

_WHITESPACE_RE = re.compile(r"\s+")
# Matches any run of whitespace; used to normalise HTML source whitespace.


def html_to_notion_blocks(html: str) -> list[dict[str, Any]]:
    """Convert a LinkedIn HTML job description to a list of Notion block objects.

    Handles the most common LinkedIn formatting:
    - ``<p>`` → paragraph block
    - ``<p><br></p>`` and other empty ``<p>`` → skipped (no spacer block)
    - ``<ul>`` / ``<ol>`` / ``<li>`` → bulleted_list_item blocks
    - ``<strong>`` / ``<b>`` → bold annotation (or ``heading_3`` when alone)
    - ``<em>`` / ``<i>`` → italic annotation
    - ``<a href="…">`` → inline link via ``text.link.url``
    - ``<section>``, ``<div>`` → transparent containers (children processed)
    - ``<button>``, ``<img>``, ``<script>``, ``<style>`` → skipped silently
    - ``<table>`` → cell text flattened to plain paragraphs
    - HTML comments, doctypes and processing instructions → skipped silently

    Note:
        Nested lists are flattened to top-level ``bulleted_list_item`` blocks.
        Notion nested list support requires recursive ``children`` blocks, which
        adds complexity and is deferred.

    Args:
        html: HTML string from BrightData's ``job_description_formatted`` field.

    Returns:
        List of Notion block dicts, truncated to ``_MAX_BLOCKS`` items.
    """
    if not html:
        return []

    soup = BeautifulSoup(html, "html.parser")

    # LinkedIn wraps content in a specific inner div — prefer it if present.
    inner = soup.find("div", class_=_LINKEDIN_CONTENT_CLASS)
    root: Tag = inner if isinstance(inner, Tag) else soup

    blocks: list[dict[str, Any]] = []
    _emit_blocks(root, blocks)
    return blocks[:_MAX_BLOCKS]


# ── Block-level processing ────────────────────────────────────────────────────


def _is_ignorable(node: PageElement) -> bool:
    """Return True when a node contributes no visible content of its own.

    That is the case for markup-only strings (see ``_NON_TEXT_NODES``) and for
    text nodes that contain nothing but whitespace.

    Args:
        node: Any BeautifulSoup node.

    Returns:
        True for markup-only or whitespace-only string nodes, False otherwise
        (including every ``Tag``).
    """
    if isinstance(node, _NON_TEXT_NODES):
        return True
    return isinstance(node, NavigableString) and not str(node).strip()


def _heading_rich_text(tag: Tag) -> list[dict[str, Any]]:
    """Build the rich text for a heading promoted from a bold tag.

    Whitespace-only chunks are dropped and the ``annotations`` key is removed
    from every chunk, so the heading is not additionally rendered bold.

    Args:
        tag: The ``<strong>``/``<b>`` tag acting as a section header.

    Returns:
        List of Notion rich text dicts without annotations; may be empty.
    """
    return [
        {key: value for key, value in chunk.items() if key != "annotations"}
        for chunk in _make_rich_text(tag)
        if chunk["text"]["content"].strip()
    ]


def _emit_blocks(parent: Tag, blocks: list[dict[str, Any]]) -> None:
    """Walk direct children of parent and append block objects to blocks.

    Consecutive inline children (text nodes, ``<strong>``, ``<em>``, ``<a>``,
    etc.) are buffered into a single run and emitted together as one paragraph.
    Whitespace-only text between two inline children is kept in the run so the
    words on either side stay separated, matching what happens inside ``<p>``.

    Two rules for splitting the run into separate paragraphs:

    1. **``<br><br>`` paragraph separator**: LinkedIn flat-HTML uses ``<br><br>``
       instead of ``<p>`` tags to separate paragraphs. Two consecutive ``<br>``
       tags flush the current run. A single ``<br>`` is treated as a soft break
       within the same paragraph and produces no output.
    2. **Block-level section header**: a lone ``<strong>``/``<b>`` with no
       surrounding text is emitted as ``heading_3``. A ``<strong>``/``<b>``
       that itself *contains* a ``<br>`` child is also a section header
       (LinkedIn flat-HTML pattern). ``<em>``/``<i>`` are always italic
       emphasis — they are never promoted to headings.

    NOTE(review): because a single ``<br>`` emits nothing, ``A<br>B`` with no
    whitespace in the source is rendered as ``AB``.

    Args:
        parent: BeautifulSoup Tag whose children to process.
        blocks: Mutable list accumulating Notion block dicts.
    """
    run: list[PageElement] = []
    consecutive_br = 0

    def _flush_run() -> None:
        """Emit the buffered inline run as a heading or paragraph, then reset it."""
        if not run:
            return
        significant = [node for node in run if not _is_ignorable(node)]
        lone = significant[0] if len(significant) == 1 else None
        if isinstance(lone, Tag) and lone.name in _BOLD_TAGS:
            # Lone bold tag at block level → section header.
            # em/i are italic emphasis — never section headers on LinkedIn.
            rich_text = _heading_rich_text(lone)
            if rich_text:
                blocks.append(_heading_3(rich_text))
        else:
            parts: list[dict[str, Any]] = []
            for node in run:
                _collect_inline(
                    node, parts,
                    bold=False, italic=False, link_url=None, skip_lists=False,
                )
            # Edge stripping removes whitespace-only chunks, so a non-empty
            # result always carries visible text.
            rich_text = _strip_rich_text_edges(_split_chunks(parts))
            if rich_text:
                blocks.append(_paragraph(rich_text))
        run.clear()

    for child in parent.children:
        if isinstance(child, Tag):
            if child.name not in _INLINE_TAGS:
                # Block-level tag: close the pending inline run first.
                consecutive_br = 0
                _flush_run()
                _tag_to_blocks(child, blocks)
                continue
            if child.name == "br":
                consecutive_br += 1
                if consecutive_br >= 2:
                    # <br><br> is LinkedIn's paragraph separator in flat HTML.
                    _flush_run()
                    consecutive_br = 0
                # <br> itself is never added to the run — it produces no output.
                continue
            if child.name in _BOLD_TAGS and child.find("br") is not None:
                # <strong>Header<br><br></strong>: flat-HTML section header pattern.
                # em/i with <br> inside are italic paragraphs, not headers.
                consecutive_br = 0
                _flush_run()
                rich_text = _heading_rich_text(child)
                if rich_text:
                    blocks.append(_heading_3(rich_text))
                continue
        elif not isinstance(child, NavigableString):
            continue
        elif _is_ignorable(child):
            # Whitespace (or a comment) between <br> tags must not reset the
            # consecutive_br counter, so the separator is still detected.
            # Inside a started run the whitespace is kept as a word separator;
            # edge stripping removes it again if it ends up trailing.
            if run:
                run.append(child)
            continue
        consecutive_br = 0
        run.append(child)
    _flush_run()


def _tag_to_blocks(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Convert a single HTML tag to one or more Notion block objects.

    ``<h1>``–``<h6>`` are emitted as plain paragraph blocks. Unknown tags are
    treated like transparent containers.

    Args:
        tag: BeautifulSoup Tag to convert.
        blocks: Mutable list accumulating Notion block dicts.
    """
    if tag.name in _SKIP_TAGS:
        return

    if tag.name == "p":
        _p_to_block(tag, blocks)
    elif tag.name in {"ul", "ol"}:
        _list_to_blocks(tag, blocks)
    elif tag.name == "table":
        _table_to_blocks(tag, blocks)
    elif tag.name in _TRANSPARENT_TAGS:
        _emit_blocks(tag, blocks)
    elif tag.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
        rich_text = _make_rich_text(tag)
        if rich_text:
            blocks.append(_paragraph(rich_text))
    elif tag.name == "br":
        return  # bare <br> outside inline run — no output needed
    else:
        _emit_blocks(tag, blocks)


def _p_to_block(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Convert a ``<p>`` tag to a paragraph block.

    A ``<p>`` without visible text (``<p></p>``, ``<p><br></p>``,
    ``<p>&nbsp;</p>``) produces no block: Notion's block spacing already
    provides the visual separation, and every other emitter in this module
    skips empty rich text as well.

    Args:
        tag: A ``<p>`` BeautifulSoup Tag.
        blocks: Mutable list accumulating Notion block dicts.
    """
    rich_text = _make_rich_text(tag)
    if rich_text:
        blocks.append(_paragraph(rich_text))


def _list_to_blocks(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Convert a ``<ul>`` or ``<ol>`` tag to bulleted_list_item blocks.

    Nested lists are flattened: inner ``<ul>``/``<ol>`` items become sibling
    ``bulleted_list_item`` blocks at the same level, emitted right after the
    ``<li>`` that contains them.

    Args:
        tag: A ``<ul>`` or ``<ol>`` BeautifulSoup Tag.
        blocks: Mutable list accumulating Notion block dicts.
    """
    for child in tag.children:
        if not (isinstance(child, Tag) and child.name == "li"):
            continue
        # The item's own text first; nested list subtrees are handled below.
        rich_text = _make_rich_text(child, skip_lists=True)
        if rich_text:
            blocks.append(_bulleted_list_item(rich_text))
        for nested in child.children:
            if isinstance(nested, Tag) and nested.name in {"ul", "ol"}:
                _list_to_blocks(nested, blocks)


def _table_to_blocks(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Flatten table cell text to paragraph blocks.

    Each non-empty ``<td>``/``<th>`` becomes one paragraph, in document order.

    Args:
        tag: A ``<table>`` BeautifulSoup Tag.
        blocks: Mutable list accumulating Notion block dicts.
    """
    for cell in tag.find_all(["td", "th"]):
        rich_text = _make_rich_text(cell)
        if rich_text:
            blocks.append(_paragraph(rich_text))


# ── Inline (rich text) processing ────────────────────────────────────────────


def _make_rich_text(tag: Tag, *, skip_lists: bool = False) -> list[dict[str, Any]]:
    """Extract a list of Notion rich text objects from a tag's inline content.

    Args:
        tag: BeautifulSoup Tag to extract inline content from.
        skip_lists: If True, skip ``<ul>``/``<ol>`` subtrees (used for ``<li>``
            items to avoid double-processing nested lists).

    Returns:
        List of Notion rich text dicts, chunks split to fit
        ``_RICH_TEXT_MAX_LENGTH`` and trimmed at both edges. Empty when the tag
        holds no visible text.
    """
    parts: list[dict[str, Any]] = []
    _collect_inline(tag, parts, bold=False, italic=False, link_url=None, skip_lists=skip_lists)
    return _strip_rich_text_edges(_split_chunks(parts))


def _collect_inline(
    node: PageElement,
    parts: list[dict[str, Any]],
    *,
    bold: bool,
    italic: bool,
    link_url: str | None,
    skip_lists: bool,
) -> None:
    """Recursively collect inline Notion rich text parts from a BeautifulSoup node.

    Args:
        node: A Tag or NavigableString to process.
        parts: Mutable list accumulating raw rich text chunk dicts.
        bold: Whether bold annotation is currently active.
        italic: Whether italic annotation is currently active.
        link_url: Active hyperlink URL, if any.
        skip_lists: Skip ``<ul>``/``<ol>`` subtrees when True.
    """
    if isinstance(node, _NON_TEXT_NODES):
        # Comments, doctypes, etc. are NavigableString subclasses; without this
        # guard their markup would be emitted as visible text.
        return

    if isinstance(node, NavigableString):
        # Normalize HTML source whitespace: replace newlines/tabs with spaces and
        # collapse consecutive spaces — mirrors how browsers render inline text nodes.
        # Single-space nodes (e.g. "<strong>Bold</strong> <em>Italic</em>") are
        # preserved as inter-element separators; _strip_rich_text_edges handles
        # any leading/trailing space that remains at paragraph boundaries.
        text = _WHITESPACE_RE.sub(" ", str(node))
        if text:
            parts.append(_text_chunk(text, bold=bold, italic=italic, link_url=link_url))
        return

    if not isinstance(node, Tag):
        return

    if node.name in _SKIP_TAGS:
        return
    if skip_lists and node.name in {"ul", "ol"}:
        return
    if node.name == "br":
        return  # inline <br> produces no output

    new_bold = bold or node.name in _BOLD_TAGS
    new_italic = italic or node.name in {"em", "i"}
    new_link = link_url

    if node.name == "a":
        href = node.get("href")
        if isinstance(href, str):
            new_link = href
        elif isinstance(href, list) and href:
            # Multi-valued attribute: use the first value.
            new_link = href[0]

    for child in node.children:
        _collect_inline(
            child, parts,
            bold=new_bold, italic=new_italic, link_url=new_link, skip_lists=skip_lists,
        )


def _split_chunks(parts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Split rich text parts whose content exceeds ``_RICH_TEXT_MAX_LENGTH``.

    Every piece of a split part keeps the part's other keys (link, annotations).

    Args:
        parts: Raw list of Notion rich text chunk dicts.

    Returns:
        List where every content string is at most ``_RICH_TEXT_MAX_LENGTH`` chars.
    """
    result: list[dict[str, Any]] = []
    for part in parts:
        content: str = part["text"]["content"]
        if len(content) <= _RICH_TEXT_MAX_LENGTH:
            result.append(part)
            continue
        for start in range(0, len(content), _RICH_TEXT_MAX_LENGTH):
            piece = content[start : start + _RICH_TEXT_MAX_LENGTH]
            result.append({**part, "text": {**part["text"], "content": piece}})
    return result


def _strip_rich_text_edges(chunks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Strip whitespace from the start and the end of a rich text chunk list.

    HTML source indentation collapses to a single leading/trailing space after
    whitespace normalization. Stripping at the paragraph boundary (rather than
    per-chunk) preserves intentional inter-word spaces between inline elements.

    Whitespace-only chunks at either edge are dropped and the next chunk is
    stripped in turn, so the result never starts or ends with whitespace even
    when the edge whitespace sits in its own chunk (e.g. ``[" ", " Title"]``).

    Args:
        chunks: Notion rich text chunk list. The list and its dicts are not
            modified.

    Returns:
        New list with edges trimmed; empty chunks are removed.
    """
    result = list(chunks)

    while result:
        head = result[0]
        text = head["text"]["content"].lstrip()
        if text:
            result[0] = {**head, "text": {**head["text"], "content": text}}
            break
        del result[0]

    while result:
        tail = result[-1]
        text = tail["text"]["content"].rstrip()
        if text:
            result[-1] = {**tail, "text": {**tail["text"], "content": text}}
            break
        del result[-1]

    return [chunk for chunk in result if chunk["text"]["content"]]


# ── Notion block / rich text constructors ────────────────────────────────────


def _text_chunk(
    content: str,
    *,
    bold: bool = False,
    italic: bool = False,
    link_url: str | None = None,
) -> dict[str, Any]:
    """Build a Notion rich text ``text`` object.

    The ``annotations`` key is only present when bold or italic is set, and
    ``text.link`` only when ``link_url`` is non-empty.

    Args:
        content: Text string.
        bold: Apply bold annotation.
        italic: Apply italic annotation.
        link_url: If set, attach as an inline hyperlink.

    Returns:
        Notion rich text dict.
    """
    text: dict[str, Any] = {"content": content}
    if link_url:
        text["link"] = {"url": link_url}
    chunk: dict[str, Any] = {"type": "text", "text": text}
    if bold or italic:
        chunk["annotations"] = {"bold": bold, "italic": italic}
    return chunk


def _paragraph(rich_text: list[dict[str, Any]]) -> dict[str, Any]:
    """Wrap rich text in a Notion ``paragraph`` block dict."""
    return {"type": "paragraph", "paragraph": {"rich_text": rich_text}}


def _heading_3(rich_text: list[dict[str, Any]]) -> dict[str, Any]:
    """Wrap rich text in a Notion ``heading_3`` block dict."""
    return {"type": "heading_3", "heading_3": {"rich_text": rich_text}}


def _bulleted_list_item(rich_text: list[dict[str, Any]]) -> dict[str, Any]:
    """Wrap rich text in a Notion ``bulleted_list_item`` block dict."""
    return {"type": "bulleted_list_item", "bulleted_list_item": {"rich_text": rich_text}}