# markdown_to_speech.py
1 """ 2 Markdown to Speech Preprocessor 3 4 Converts markdown-formatted text to natural, speakable text suitable for 5 Text-to-Speech (TTS) engines. 6 7 Uses markdown-it-py for robust markdown parsing and BeautifulSoup for 8 HTML text extraction. 9 """ 10 import re 11 import html 12 from typing import Optional 13 from dataclasses import dataclass 14 15 # Maximum input length to prevent resource exhaustion (100KB) 16 # This matches the limit in stream_speech API endpoint 17 MAX_INPUT_LENGTH = 100 * 1024 18 19 20 @dataclass 21 class MarkdownToSpeechOptions: 22 """Configuration options for markdown to speech conversion.""" 23 24 # Whether to announce code blocks (e.g., "Code block: print hello") 25 read_code_blocks: bool = False 26 27 # Whether to announce images (e.g., "Image: description") 28 read_images: bool = True 29 30 # Whether to read citation references like [1], [2] 31 read_citations: bool = True 32 33 # Format for citations. Use {n} as placeholder for the number 34 # Set to empty string to skip citations entirely 35 citation_format: str = "reference {n}" 36 37 # Whether to add pauses (periods) after headers 38 add_header_pauses: bool = True 39 40 # Prefix for code blocks when read_code_blocks is True 41 code_block_prefix: str = "Code block." 42 43 # Placeholder for code blocks when read_code_blocks is False 44 # Set to empty string to completely remove code blocks 45 code_block_placeholder: str = "Code omitted." 46 47 # Prefix for images when read_images is True 48 image_prefix: str = "Image:" 49 50 51 # Default options instance 52 DEFAULT_OPTIONS = MarkdownToSpeechOptions() 53 54 55 def markdown_to_speech( 56 text: str, options: Optional[MarkdownToSpeechOptions] = None 57 ) -> str: 58 """ 59 Convert markdown text to natural speech-friendly text. 60 61 This function uses markdown-it-py to parse markdown into HTML, 62 then uses BeautifulSoup to extract clean text. 
63 64 Args: 65 text: Markdown-formatted text 66 options: Optional configuration for conversion behavior 67 68 Returns: 69 Plain text suitable for TTS 70 71 Examples: 72 >>> markdown_to_speech("This is **bold** text") 73 'This is bold text' 74 75 >>> markdown_to_speech("Click [here](https://example.com)") 76 'Click here' 77 """ 78 if not text: 79 return "" 80 81 # Truncate input to prevent resource exhaustion 82 if len(text) > MAX_INPUT_LENGTH: 83 text = text[:MAX_INPUT_LENGTH] 84 85 opts = options or DEFAULT_OPTIONS 86 87 # Step 1: Handle SAM-specific citations BEFORE markdown parsing 88 # (markdown-it doesn't recognize these custom formats) 89 result = _handle_sam_citations(text, opts) 90 91 # Step 2: Handle code blocks specially (before markdown parsing) 92 # We need to do this first because we want to control how they're rendered 93 result = _handle_code_blocks_pre(result, opts) 94 95 # Step 3: Handle ordered lists to preserve numbers (before markdown parsing) 96 # markdown-it renders <ol><li> which loses the original numbers 97 result = _handle_ordered_lists_pre(result) 98 99 # Step 4: Convert markdown to HTML using markdown-it-py 100 result = _markdown_to_html(result) 101 102 # Step 5: Extract text from HTML using BeautifulSoup 103 result = _html_to_text(result, opts) 104 105 # Step 6: Handle bare URLs that might have been missed 106 result = _handle_bare_urls(result) 107 108 # Step 7: Normalize whitespace for natural speech 109 result = _normalize_whitespace(result) 110 111 return result.strip() 112 113 114 def _markdown_to_html(text: str) -> str: 115 """Convert markdown to HTML using markdown-it-py. 
116 """ 117 from markdown_it import MarkdownIt 118 119 # Create parser with commonmark preset 120 # and enable tables and strikethrough for better markdown support 121 md = MarkdownIt("commonmark").enable(["table", "strikethrough"]) 122 return md.render(text) 123 124 125 def _html_to_text(html_content: str, opts: MarkdownToSpeechOptions) -> str: 126 """Extract plain text from HTML using BeautifulSoup. 127 128 """ 129 from bs4 import BeautifulSoup 130 131 soup = BeautifulSoup(html_content, "html.parser") 132 133 # Handle images - either announce them or remove them 134 for img in soup.find_all("img"): 135 alt_text = img.get("alt", "").strip() 136 if opts.read_images and alt_text: 137 img.replace_with(f" {opts.image_prefix} {alt_text}. ") 138 else: 139 img.decompose() 140 141 # Handle code blocks 142 for code in soup.find_all("pre"): 143 if opts.read_code_blocks: 144 code.replace_with(f" {opts.code_block_prefix} ") 145 else: 146 code.decompose() 147 148 # Add periods after headers for natural pauses 149 if opts.add_header_pauses: 150 for header in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): 151 header_text = header.get_text().strip() 152 header.replace_with(f"{header_text}. ") 153 154 # Get text content 155 text = soup.get_text(separator=" ") 156 157 # Decode any remaining HTML entities 158 text = html.unescape(text) 159 160 return text 161 162 163 def _handle_code_blocks_pre(text: str, opts: MarkdownToSpeechOptions) -> str: 164 """ 165 Pre-process code blocks before markdown parsing. 166 This ensures we have control over how they're handled. 
167 """ 168 if opts.read_code_blocks: 169 replacement = f" {opts.code_block_prefix} " 170 elif opts.code_block_placeholder: 171 replacement = f" {opts.code_block_placeholder} " 172 else: 173 replacement = " " 174 result = [] 175 i = 0 176 text_len = len(text) 177 178 while i < text_len: 179 # Look for opening ``` 180 if text[i : i + 3] == "```": 181 # Find the closing ``` 182 # Skip past the opening ``` and any language identifier 183 start = i + 3 184 # Skip language identifier (word characters until newline or space) 185 while start < text_len and text[start] not in "\n \t`": 186 start += 1 187 # Find closing ``` 188 close_pos = text.find("```", start) 189 if close_pos != -1: 190 # Found a complete code block, replace it 191 result.append(replacement) 192 i = close_pos + 3 193 else: 194 # No closing ```, treat as regular text 195 result.append(text[i]) 196 i += 1 197 else: 198 result.append(text[i]) 199 i += 1 200 201 return "".join(result) 202 203 204 def _handle_ordered_lists_pre(text: str) -> str: 205 """ 206 Pre-process ordered lists to preserve numbers before markdown parsing. 207 208 markdown-it renders ordered lists as <ol><li> which loses the original 209 numbers. This function converts "1. Item" to "1, Item" so the numbers 210 are preserved in the final text. 211 """ 212 lines = text.split("\n") 213 result_lines = [] 214 for line in lines: 215 # Match ordered list items: "1. Item" or " 1. Item" (with leading spaces) 216 # Using explicit character classes to avoid regex quantifier issues 217 stripped = line.lstrip(" \t") 218 if stripped and len(stripped) > 2: 219 # Check if line starts with digit(s) followed by ". " 220 dot_pos = stripped.find(". ") 221 if dot_pos > 0 and dot_pos <= 10: # Reasonable limit for list numbers 222 prefix = stripped[:dot_pos] 223 if prefix.isdigit(): 224 # Replace "N. 
" with "N, " to preserve the number 225 leading_space = line[: len(line) - len(stripped)] 226 rest = stripped[dot_pos + 2 :] 227 result_lines.append(f"{leading_space}{prefix}, {rest}") 228 continue 229 result_lines.append(line) 230 return "\n".join(result_lines) 231 232 233 def _handle_sam_citations(text: str, opts: MarkdownToSpeechOptions) -> str: 234 """ 235 Handle SAM-specific citation formats that markdown parsers don't recognize. 236 237 Formats handled: 238 - Simple: [1], [2], etc. 239 - SAM cite format: [[cite:search0]], [[cite:research0]], [[cite:file0]], [[cite:ref0]] 240 - Web search format: [[cite:s1r1]], [[cite:s2r3]] (s=search turn, r=result index) 241 - Multi-citations: [[cite:search0, search1, search2]] 242 - Single bracket variants: [cite:search0] 243 """ 244 if not opts.read_citations: 245 # Remove all citation formats entirely 246 text = re.sub(r"\[?\[cite:[^\]]+\]\]?", "", text) 247 text = re.sub(r"\[(\d+)\]", "", text) 248 return text 249 250 # Handle web search format: [[cite:s1r1]], [[cite:s2r3]] 251 def replace_web_search_citation(match): 252 search_turn = match.group(1) 253 result_index = match.group(2) 254 return f", search {search_turn} result {result_index}," 255 256 text = re.sub( 257 r"\[?\[cite:s(\d+)r(\d+)\]\]?", replace_web_search_citation, text 258 ) 259 260 # Handle SAM-style multi-citations 261 def replace_multi_citation(match): 262 content = match.group(1) 263 individual_pattern = r"(?:cite:)?(file|ref|search|research)?(\d+)" 264 citations = re.findall(individual_pattern, content) 265 if not citations: 266 return "" 267 268 spoken_parts = [] 269 for cite_type, num in citations: 270 cite_type = cite_type or "search" 271 display_num = str(int(num) + 1) 272 if cite_type == "research": 273 spoken_parts.append(f"research source {display_num}") 274 elif cite_type == "search": 275 spoken_parts.append(f"source {display_num}") 276 elif cite_type == "file": 277 spoken_parts.append(f"file {display_num}") 278 elif cite_type == "ref": 279 
spoken_parts.append(f"reference {display_num}") 280 else: 281 spoken_parts.append(f"source {display_num}") 282 283 if len(spoken_parts) == 1: 284 return f", {spoken_parts[0]}," 285 elif len(spoken_parts) == 2: 286 return f", {spoken_parts[0]} and {spoken_parts[1]}," 287 else: 288 return f', {", ".join(spoken_parts[:-1])}, and {spoken_parts[-1]},' 289 290 multi_cite_pattern = r"\[?\[cite:((?:(?:file|ref|search|research)?\d+)(?:\s*,\s*(?:cite:)?(?:file|ref|search|research)?\d+)+)\]\]?" 291 text = re.sub(multi_cite_pattern, replace_multi_citation, text) 292 293 # Handle SAM-style single citations 294 def replace_sam_citation(match): 295 cite_type = match.group(1) or "search" 296 num = match.group(2) 297 display_num = str(int(num) + 1) 298 299 if cite_type == "research": 300 spoken = f"research source {display_num}" 301 elif cite_type == "search": 302 spoken = f"source {display_num}" 303 elif cite_type == "file": 304 spoken = f"file {display_num}" 305 elif cite_type == "ref": 306 spoken = f"reference {display_num}" 307 else: 308 spoken = f"source {display_num}" 309 310 return f", {spoken}," 311 312 sam_cite_pattern = r"\[?\[cite:(?:(file|ref|search|research))?(\d+)\]\]?" 313 text = re.sub(sam_cite_pattern, replace_sam_citation, text) 314 315 # Handle simple citations [1], [2], etc. 316 if opts.citation_format: 317 318 def replace_simple_citation(match): 319 num = match.group(1) 320 spoken = opts.citation_format.replace("{n}", num) 321 return f", {spoken}," 322 323 text = re.sub(r"\[(\d+)\]", replace_simple_citation, text) 324 else: 325 text = re.sub(r"\[(\d+)\]", "", text) 326 327 return text 328 329 330 def _handle_bare_urls(text: str) -> str: 331 """Replace bare URLs with 'link' for natural speech.""" 332 url_pattern = r"(?<!\()\bhttps?://[^\s<>\[\]()]+\b" 333 return re.sub(url_pattern, "link", text) 334 335 336 def _normalize_whitespace(text: str) -> str: 337 """ 338 Normalize whitespace for natural speech. 
339 340 """ 341 # Replace newlines/carriage returns with single space 342 text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ") 343 344 # Replace multiple spaces with single space using split/join 345 # This is more efficient than regex for this simple case 346 text = " ".join(text.split()) 347 348 # Clean up punctuation spacing - remove spaces before punctuation 349 # Using character-by-character replacement to avoid regex quantifiers 350 for punct in ".,!?;:": 351 text = text.replace(f" {punct}", punct) 352 353 # Remove duplicate commas from citation handling 354 while ",," in text or ", ," in text: 355 text = text.replace(",,", ",").replace(", ,", ",") 356 357 return text