Cradicle Explorer

/ integrations / html_to_notion.py
html_to_notion.py
  1  """Convert LinkedIn HTML job descriptions to Notion rich text blocks."""
  2  
  3  __all__ = ["html_to_notion_blocks"]
  4  
import re
from typing import Any

from bs4 import BeautifulSoup, Comment, NavigableString, PageElement, Tag
  9  
 10  _MAX_BLOCKS = 100
 11  # Notion API limit for children blocks in a single pages.create call.
 12  
 13  _RICH_TEXT_MAX_LENGTH = 2000
 14  # Notion API limit per rich text element content string.
 15  
 16  _LINKEDIN_CONTENT_CLASS = "show-more-less-html__markup"
 17  # CSS class for the inner content div in LinkedIn's show-more-less-html section.
 18  
 19  _SKIP_TAGS = frozenset({"button", "img", "script", "style"})
 20  # Tags whose entire subtree should be silently ignored.
 21  
 22  _TRANSPARENT_TAGS = frozenset({"section", "div", "span", "header", "footer", "main", "article"})
 23  # Container tags that produce no block of their own — children are processed directly.
 24  
 25  _INLINE_TAGS = frozenset(
 26      {"strong", "b", "em", "i", "a", "span", "code", "sub", "sup", "u", "s", "br"}
 27  )
 28  # Tags that participate in inline (rich text) rendering when encountered at block level.
 29  
 30  
 31  def html_to_notion_blocks(html: str) -> list[dict[str, Any]]:
 32      """Convert a LinkedIn HTML job description to a list of Notion block objects.
 33  
 34      Handles the most common LinkedIn formatting:
 35      - ``<p>`` → paragraph block
 36      - ``<p><br></p>`` → empty paragraph block
 37      - ``<ul>`` / ``<li>`` → bulleted_list_item blocks
 38      - ``<strong>`` / ``<b>`` → bold annotation
 39      - ``<em>`` / ``<i>`` → italic annotation
 40      - ``<a href="…">`` → inline link via ``text.link.url``
 41      - ``<section>``, ``<div>`` → transparent containers (children processed)
 42      - ``<button>``, ``<img>``, ``<script>``, ``<style>`` → skipped silently
 43      - ``<table>`` → cell text flattened to plain paragraphs
 44  
 45      Note:
 46          Nested lists are flattened to top-level ``bulleted_list_item`` blocks.
 47          Notion nested list support requires recursive ``children`` blocks, which
 48          adds complexity and is deferred.
 49  
 50      Args:
 51          html: HTML string from BrightData's ``job_description_formatted`` field.
 52  
 53      Returns:
 54          List of Notion block dicts, truncated to ``_MAX_BLOCKS`` items.
 55      """
 56      if not html:
 57          return []
 58  
 59      soup = BeautifulSoup(html, "html.parser")
 60  
 61      # LinkedIn wraps content in a specific inner div — prefer it if present.
 62      inner = soup.find("div", class_=_LINKEDIN_CONTENT_CLASS)
 63      root: Tag = inner if isinstance(inner, Tag) else soup
 64  
 65      blocks: list[dict[str, Any]] = []
 66      _emit_blocks(root, blocks)
 67      return blocks[:_MAX_BLOCKS]
 68  
 69  
 70  # ── Block-level processing ────────────────────────────────────────────────────
 71  
 72  
 73  def _emit_blocks(parent: Tag, blocks: list[dict[str, Any]]) -> None:
 74      """Walk direct children of parent and append block objects to blocks.
 75  
 76      Consecutive inline children (text nodes, ``<strong>``, ``<em>``, ``<a>``,
 77      etc.) are buffered into a single run and emitted together as one paragraph.
 78  
 79      Two rules for splitting the run into separate paragraphs:
 80  
 81      1. **``<br><br>`` paragraph separator**: LinkedIn flat-HTML uses ``<br><br>``
 82         instead of ``<p>`` tags to separate paragraphs.  Two consecutive ``<br>``
 83         tags flush the current run.  A single ``<br>`` is treated as a soft break
 84         within the same paragraph (Notion has no inline line-break, so it produces
 85         no output).
 86      2. **Block-level section header**: a lone ``<strong>``/``<b>`` with no
 87         surrounding text is emitted as ``heading_3``.  A ``<strong>``/``<b>``
 88         that itself *contains* a ``<br>`` child is also a section header
 89         (LinkedIn flat-HTML pattern).  ``<em>``/``<i>`` are always italic
 90         emphasis — they are never promoted to headings.
 91  
 92      Args:
 93          parent: BeautifulSoup Tag whose children to process.
 94          blocks: Mutable list accumulating Notion block dicts.
 95      """
 96      run: list[PageElement] = []
 97      consecutive_br = 0
 98  
 99      def _flush_run() -> None:
100          if not run:
101              return
102          significant = [
103              c for c in run
104              if not (isinstance(c, NavigableString) and not str(c).strip())
105          ]
106          if (
107              len(significant) == 1
108              and isinstance(significant[0], Tag)
109              and significant[0].name in {"strong", "b"}
110          ):
111              # Lone bold tag at block level → section header.
112              # em/i are italic emphasis — never section headers on LinkedIn.
113              lone: Tag = significant[0]
114              raw = [c for c in _make_rich_text(lone) if c["text"]["content"].strip()]
115              rich_text = [{k: v for k, v in c.items() if k != "annotations"} for c in raw]
116              if rich_text:
117                  blocks.append(_heading_3(rich_text))
118          else:
119              parts: list[dict[str, Any]] = []
120              for node in run:
121                  _collect_inline(
122                      node, parts,
123                      bold=False, italic=False, link_url=None, skip_lists=False,
124                  )
125              rich_text = _strip_rich_text_edges(_split_chunks(parts))
126              joined = "".join(c["text"]["content"] for c in rich_text)
127              if joined.strip():
128                  blocks.append(_paragraph(rich_text))
129          run.clear()
130  
131      for child in parent.children:
132          if isinstance(child, NavigableString) or (
133              isinstance(child, Tag) and child.name in _INLINE_TAGS
134          ):
135              if isinstance(child, Tag) and child.name == "br":
136                  consecutive_br += 1
137                  if consecutive_br >= 2:
138                      # <br><br> is LinkedIn's paragraph separator in flat HTML.
139                      _flush_run()
140                      consecutive_br = 0
141                  # <br> is never added to run — Notion has no inline line break.
142              elif (
143                  isinstance(child, Tag)
144                  and child.name in {"strong", "b"}
145                  and child.find("br") is not None
146              ):
147                  # <strong>Header<br><br></strong>: flat-HTML section header pattern.
148                  # em/i with <br> inside are italic paragraphs, not headers.
149                  consecutive_br = 0
150                  _flush_run()
151                  raw = [c for c in _make_rich_text(child) if c["text"]["content"].strip()]
152                  rich_text = [{k: v for k, v in c.items() if k != "annotations"} for c in raw]
153                  if rich_text:
154                      blocks.append(_heading_3(rich_text))
155              else:
156                  if isinstance(child, NavigableString) and not str(child).strip():
157                      # Whitespace-only text between <br> tags — skip, but preserve
158                      # the consecutive_br counter so the separator is still detected.
159                      pass
160                  else:
161                      consecutive_br = 0
162                  run.append(child)
163          elif isinstance(child, Tag):
164              consecutive_br = 0
165              _flush_run()
166              _tag_to_blocks(child, blocks)
167      _flush_run()
168  
169  
def _tag_to_blocks(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Dispatch one block-level HTML tag to the converter for its tag name.

    Args:
        tag: BeautifulSoup Tag to convert.
        blocks: Mutable list accumulating Notion block dicts.
    """
    name = tag.name

    # Ignored subtrees, and a bare <br> outside an inline run, emit nothing.
    if name in _SKIP_TAGS or name == "br":
        return

    if name == "p":
        _p_to_block(tag, blocks)
        return

    if name in ("ul", "ol"):
        _list_to_blocks(tag, blocks)
        return

    if name == "table":
        _table_to_blocks(tag, blocks)
        return

    if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        # HTML headings are emitted as plain paragraphs.
        heading_text = _make_rich_text(tag)
        if heading_text:
            blocks.append(_paragraph(heading_text))
        return

    # Transparent containers and any unrecognised tag: process the children.
    _emit_blocks(tag, blocks)
196  
197  
def _p_to_block(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Convert a ``<p>`` tag to a paragraph block.

    A ``<p>`` with no visible text — ``<p><br></p>``, ``<p></p>`` or one that
    holds only whitespace — produces no block: Notion's block spacing already
    provides the visual separation, and an empty paragraph would only use up
    part of the block budget.

    Args:
        tag: A ``<p>`` BeautifulSoup Tag.
        blocks: Mutable list accumulating Notion block dicts.
    """
    rich_text = _make_rich_text(tag)
    if rich_text:
        blocks.append(_paragraph(rich_text))
218  
219  
def _list_to_blocks(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Convert a ``<ul>`` or ``<ol>`` tag to bulleted_list_item blocks.

    Nested lists are flattened: inner list items become sibling
    ``bulleted_list_item`` blocks at the same level.  Both nesting forms are
    handled — a list inside an ``<li>``, and a list placed directly inside
    another list (invalid HTML, but it occurs in practice and its items would
    otherwise be lost).

    Args:
        tag: A ``<ul>`` or ``<ol>`` BeautifulSoup Tag.
        blocks: Mutable list accumulating Notion block dicts.
    """
    for child in tag.children:
        if not isinstance(child, Tag):
            continue
        if child.name in {"ul", "ol"}:
            # List nested directly under a list, without a wrapping <li>.
            _list_to_blocks(child, blocks)
            continue
        if child.name != "li":
            continue
        # The item's own text first (nested lists excluded), then its sub-items.
        rich_text = _make_rich_text(child, skip_lists=True)
        if rich_text:
            blocks.append(_bulleted_list_item(rich_text))
        for nested in child.children:
            if isinstance(nested, Tag) and nested.name in {"ul", "ol"}:
                _list_to_blocks(nested, blocks)
239  
240  
def _table_to_blocks(tag: Tag, blocks: list[dict[str, Any]]) -> None:
    """Emit one paragraph block per non-empty table cell, in document order.

    Args:
        tag: A ``<table>`` BeautifulSoup Tag.
        blocks: Mutable list accumulating Notion block dicts.
    """
    cells = tag.find_all(["td", "th"])
    blocks.extend(
        _paragraph(cell_text)
        for cell_text in map(_make_rich_text, cells)
        if cell_text
    )
252  
253  
254  # ── Inline (rich text) processing ────────────────────────────────────────────
255  
256  
def _make_rich_text(tag: Tag, *, skip_lists: bool = False) -> list[dict[str, Any]]:
    """Build the Notion rich text list for the inline content of ``tag``.

    Args:
        tag: BeautifulSoup Tag to extract inline content from.
        skip_lists: If True, skip ``<ul>``/``<ol>`` subtrees (used for ``<li>``
            items to avoid double-processing nested lists).

    Returns:
        List of Notion rich text dicts: collected from the tag, split so each
        content string fits ``_RICH_TEXT_MAX_LENGTH``, then edge-trimmed.
    """
    collected: list[dict[str, Any]] = []
    _collect_inline(
        tag,
        collected,
        bold=False,
        italic=False,
        link_url=None,
        skip_lists=skip_lists,
    )
    sized = _split_chunks(collected)
    return _strip_rich_text_edges(sized)
271  
272  
def _collect_inline(
    node: PageElement,
    parts: list[dict[str, Any]],
    *,
    bold: bool,
    italic: bool,
    link_url: str | None,
    skip_lists: bool,
) -> None:
    """Recursively collect inline Notion rich text parts from a BeautifulSoup node.

    Text nodes become rich text chunks carrying the annotations active at that
    point.  HTML comments are ignored.  A ``<br>`` becomes a single space so
    that the words on either side of it are not fused together.

    Args:
        node: A Tag or NavigableString to process.
        parts: Mutable list accumulating raw rich text chunk dicts.
        bold: Whether bold annotation is currently active.
        italic: Whether italic annotation is currently active.
        link_url: Active hyperlink URL, if any.
        skip_lists: Skip ``<ul>``/``<ol>`` subtrees when True.
    """
    if isinstance(node, Comment):
        # Comment is a NavigableString subclass; without this guard the text
        # of an HTML comment would be emitted as visible content.
        return

    if isinstance(node, NavigableString):
        # Normalize HTML source whitespace: replace newlines/tabs with spaces and
        # collapse consecutive spaces — mirrors how browsers render inline text nodes.
        # Single-space nodes (e.g. "<strong>Bold</strong> <em>Italic</em>") are
        # preserved as inter-element separators; edge whitespace is trimmed
        # later, at the paragraph boundary.
        text = re.sub(r"\s+", " ", str(node))
        if text:
            parts.append(_text_chunk(text, bold=bold, italic=italic, link_url=link_url))
        return

    if not isinstance(node, Tag):
        return

    if node.name in _SKIP_TAGS:
        return
    if skip_lists and node.name in {"ul", "ol"}:
        return

    if node.name == "br":
        # Render the break as a word separator.  Nothing is added at the very
        # start of the content or right after existing whitespace, so repeated
        # or leading breaks never pile up spaces.
        if parts and not parts[-1]["text"]["content"][-1:].isspace():
            parts.append(_text_chunk(" ", bold=bold, italic=italic, link_url=link_url))
        return

    new_bold = bold or node.name in {"strong", "b"}
    new_italic = italic or node.name in {"em", "i"}
    new_link = link_url

    if node.name == "a":
        href = node.get("href")
        if isinstance(href, str):
            new_link = href
        elif isinstance(href, list) and href:
            # Multi-valued attribute: use the first value.
            new_link = href[0]

    for child in node.children:
        _collect_inline(
            child, parts,
            bold=new_bold, italic=new_italic, link_url=new_link, skip_lists=skip_lists,
        )
330  
331  
def _split_chunks(parts: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Break up rich text parts that are longer than ``_RICH_TEXT_MAX_LENGTH``.

    A part that already fits is passed through as the same dict object.  A
    longer part is replaced by consecutive copies that differ only in their
    ``text.content`` slice; every other key is carried over.

    Args:
        parts: Raw list of Notion rich text chunk dicts.

    Returns:
        List where every content string is at most ``_RICH_TEXT_MAX_LENGTH`` chars.
    """
    limit = _RICH_TEXT_MAX_LENGTH
    sized: list[dict[str, Any]] = []
    for part in parts:
        body: str = part["text"]["content"]
        if len(body) <= limit:
            sized.append(part)
            continue
        sized.extend(
            {**part, "text": {**part["text"], "content": body[start : start + limit]}}
            for start in range(0, len(body), limit)
        )
    return sized
351  
352  
def _strip_rich_text_edges(chunks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Strip leading and trailing whitespace from a rich text chunk list.

    HTML source indentation collapses to a single leading/trailing space after
    whitespace normalization.  Stripping at the paragraph boundary (rather than
    per-chunk) preserves intentional inter-word spaces between inline elements.

    Whitespace-only chunks at either end are dropped first, and the chunk that
    then sits on the edge is trimmed.  This matters for input such as
    ``" "``, ``"Skill: "``, ``" "``: trimming only the original last chunk
    would leave ``"Skill: "`` with its trailing space.

    Args:
        chunks: Notion rich text chunk list.  Neither the list nor its dicts
            are modified.

    Returns:
        New list with the edges trimmed; empty chunks are removed.
    """
    start, end = 0, len(chunks)
    while start < end and not chunks[start]["text"]["content"].strip():
        start += 1
    while end > start and not chunks[end - 1]["text"]["content"].strip():
        end -= 1

    result = list(chunks[start:end])
    if not result:
        return []

    first = result[0]
    result[0] = {
        **first,
        "text": {**first["text"], "content": first["text"]["content"].lstrip()},
    }
    # Read the last chunk only now: with a single chunk it is the one that was
    # just left-stripped, so both edges end up trimmed.
    last = result[-1]
    result[-1] = {
        **last,
        "text": {**last["text"], "content": last["text"]["content"].rstrip()},
    }
    return [c for c in result if c["text"]["content"]]
376  
377  
378  # ── Notion block / rich text constructors ────────────────────────────────────
379  
380  
381  def _text_chunk(
382      content: str,
383      *,
384      bold: bool = False,
385      italic: bool = False,
386      link_url: str | None = None,
387  ) -> dict[str, Any]:
388      """Build a Notion rich text ``text`` object.
389  
390      Args:
391          content: Text string.
392          bold: Apply bold annotation.
393          italic: Apply italic annotation.
394          link_url: If set, attach as an inline hyperlink.
395  
396      Returns:
397          Notion rich text dict.
398      """
399      text: dict[str, Any] = {"content": content}
400      if link_url:
401          text["link"] = {"url": link_url}
402      chunk: dict[str, Any] = {"type": "text", "text": text}
403      if bold or italic:
404          chunk["annotations"] = {"bold": bold, "italic": italic}
405      return chunk
406  
407  
def _paragraph(rich_text: list[dict[str, Any]]) -> dict[str, Any]:
    """Wrap ``rich_text`` in a Notion ``paragraph`` block dict."""
    payload = {"rich_text": rich_text}
    return {"type": "paragraph", "paragraph": payload}
410  
411  
def _heading_3(rich_text: list[dict[str, Any]]) -> dict[str, Any]:
    """Wrap ``rich_text`` in a Notion ``heading_3`` block dict."""
    payload = {"rich_text": rich_text}
    return {"type": "heading_3", "heading_3": payload}
414  
415  
def _bulleted_list_item(rich_text: list[dict[str, Any]]) -> dict[str, Any]:
    """Wrap ``rich_text`` in a Notion ``bulleted_list_item`` block dict."""
    payload = {"rich_text": rich_text}
    return {"type": "bulleted_list_item", "bulleted_list_item": payload}