# markdown_to_speech.py
1 """ 2 Markdown to Speech Preprocessor 3 4 Converts markdown-formatted text to natural, speakable text suitable for 5 Text-to-Speech (TTS) engines. 6 7 Uses markdown-it-py for robust markdown parsing and BeautifulSoup for 8 HTML text extraction. 9 """ 10 import re 11 import html 12 from typing import Optional 13 from dataclasses import dataclass 14 15 # Maximum input length to prevent resource exhaustion (100KB) 16 # This matches the limit in stream_speech API endpoint 17 MAX_INPUT_LENGTH = 100 * 1024 18 19 20 @dataclass 21 class MarkdownToSpeechOptions: 22 """Configuration options for markdown to speech conversion.""" 23 24 # Whether to announce code blocks (e.g., "Code block: print hello") 25 read_code_blocks: bool = False 26 27 # Whether to announce images (e.g., "Image: description") 28 read_images: bool = True 29 30 # Whether to read citation references like [1], [2] 31 read_citations: bool = True 32 33 # Format for citations. Use {n} as placeholder for the number 34 # Set to empty string to skip citations entirely 35 citation_format: str = "reference {n}" 36 37 # Whether to add pauses (periods) after headers 38 add_header_pauses: bool = True 39 40 # Prefix for code blocks when read_code_blocks is True 41 code_block_prefix: str = "Code block." 42 43 # Placeholder for code blocks when read_code_blocks is False 44 # Set to empty string to completely remove code blocks 45 code_block_placeholder: str = "Code omitted." 46 47 # Prefix for images when read_images is True 48 image_prefix: str = "Image:" 49 50 51 # Default options instance 52 DEFAULT_OPTIONS = MarkdownToSpeechOptions() 53 54 55 def markdown_to_speech( 56 text: str, options: Optional[MarkdownToSpeechOptions] = None 57 ) -> str: 58 """ 59 Convert markdown text to natural speech-friendly text. 60 61 This function uses markdown-it-py to parse markdown into HTML, 62 then uses BeautifulSoup to extract clean text. 
63 64 Args: 65 text: Markdown-formatted text 66 options: Optional configuration for conversion behavior 67 68 Returns: 69 Plain text suitable for TTS 70 71 Examples: 72 >>> markdown_to_speech("This is **bold** text") 73 'This is bold text' 74 75 >>> markdown_to_speech("Click [here](https://example.com)") 76 'Click here' 77 """ 78 if not text: 79 return "" 80 81 # Truncate input to prevent resource exhaustion 82 if len(text) > MAX_INPUT_LENGTH: 83 text = text[:MAX_INPUT_LENGTH] 84 85 opts = options or DEFAULT_OPTIONS 86 87 # Step 1: Handle SAM-specific citations BEFORE markdown parsing 88 # (markdown-it doesn't recognize these custom formats) 89 result = _handle_sam_citations(text, opts) 90 91 # Step 2: Handle code blocks specially (before markdown parsing) 92 # We need to do this first because we want to control how they're rendered 93 result = _handle_code_blocks_pre(result, opts) 94 95 # Step 3: Handle ordered lists to preserve numbers (before markdown parsing) 96 # markdown-it renders <ol><li> which loses the original numbers 97 result = _handle_ordered_lists_pre(result) 98 99 # Step 4: Convert markdown to HTML using markdown-it-py 100 result = _markdown_to_html(result) 101 102 # Step 5: Extract text from HTML using BeautifulSoup 103 result = _html_to_text(result, opts) 104 105 # Step 6: Handle bare URLs that might have been missed 106 result = _handle_bare_urls(result) 107 108 # Step 7: Normalize whitespace for natural speech 109 result = _normalize_whitespace(result) 110 111 return result.strip() 112 113 114 def _markdown_to_html(text: str) -> str: 115 """Convert markdown to HTML using markdown-it-py. 
116 """ 117 from markdown_it import MarkdownIt 118 119 # Create parser with commonmark preset 120 # and enable tables and strikethrough for better markdown support 121 md = MarkdownIt("commonmark").enable(["table", "strikethrough"]) 122 return md.render(text) 123 124 125 def _html_to_text(html_content: str, opts: MarkdownToSpeechOptions) -> str: 126 """Extract plain text from HTML using BeautifulSoup. 127 128 """ 129 from bs4 import BeautifulSoup 130 131 soup = BeautifulSoup(html_content, "html.parser") 132 133 # Handle images - either announce them or remove them 134 for img in soup.find_all("img"): 135 alt_text = img.get("alt", "").strip() 136 if opts.read_images and alt_text: 137 img.replace_with(f" {opts.image_prefix} {alt_text}. ") 138 else: 139 img.decompose() 140 141 # Handle code blocks 142 for code in soup.find_all("pre"): 143 if opts.read_code_blocks: 144 code.replace_with(f" {opts.code_block_prefix} ") 145 else: 146 code.decompose() 147 148 # Add periods after headers for natural pauses 149 if opts.add_header_pauses: 150 for header in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): 151 header_text = header.get_text().strip() 152 header.replace_with(f"{header_text}. ") 153 154 # Get text content 155 text = soup.get_text(separator=" ") 156 157 # Decode any remaining HTML entities 158 text = html.unescape(text) 159 160 return text 161 162 163 def _handle_code_blocks_pre(text: str, opts: MarkdownToSpeechOptions) -> str: 164 """ 165 Pre-process code blocks before markdown parsing. 166 This ensures we have control over how they're handled. 
167 """ 168 if opts.read_code_blocks: 169 replacement = f" {opts.code_block_prefix} " 170 elif opts.code_block_placeholder: 171 replacement = f" {opts.code_block_placeholder} " 172 else: 173 replacement = " " 174 result = [] 175 i = 0 176 text_len = len(text) 177 178 while i < text_len: 179 # Look for opening ``` 180 if text[i : i + 3] == "```": 181 # Find the closing ``` 182 # Skip past the opening ``` and any language identifier 183 start = i + 3 184 # Skip language identifier (word characters until newline or space) 185 while start < text_len and text[start] not in "\n \t`": 186 start += 1 187 # Find closing ``` 188 close_pos = text.find("```", start) 189 if close_pos != -1: 190 # Found a complete code block, replace it 191 result.append(replacement) 192 i = close_pos + 3 193 else: 194 # No closing ```, treat as regular text 195 result.append(text[i]) 196 i += 1 197 else: 198 result.append(text[i]) 199 i += 1 200 201 return "".join(result) 202 203 204 def _handle_ordered_lists_pre(text: str) -> str: 205 """ 206 Pre-process ordered lists to preserve numbers before markdown parsing. 207 208 markdown-it renders ordered lists as <ol><li> which loses the original 209 numbers. This function converts "1. Item" to "1, Item" so the numbers 210 are preserved in the final text. 211 """ 212 lines = text.split("\n") 213 result_lines = [] 214 for line in lines: 215 # Match ordered list items: "1. Item" or " 1. Item" (with leading spaces) 216 # Using explicit character classes to avoid regex quantifier issues 217 stripped = line.lstrip(" \t") 218 if stripped and len(stripped) > 2: 219 # Check if line starts with digit(s) followed by ". " 220 dot_pos = stripped.find(". ") 221 if dot_pos > 0 and dot_pos <= 10: # Reasonable limit for list numbers 222 prefix = stripped[:dot_pos] 223 if prefix.isdigit(): 224 # Replace "N. 
" with "N, " to preserve the number 225 leading_space = line[: len(line) - len(stripped)] 226 rest = stripped[dot_pos + 2 :] 227 result_lines.append(f"{leading_space}{prefix}, {rest}") 228 continue 229 result_lines.append(line) 230 return "\n".join(result_lines) 231 232 233 def _handle_sam_citations(text: str, opts: MarkdownToSpeechOptions) -> str: 234 """ 235 Handle SAM-specific citation formats that markdown parsers don't recognize. 236 237 Formats handled: 238 - Simple: [1], [2], etc. 239 - SAM cite format: [[cite:search0]], [[cite:research0]], [[cite:file0]], [[cite:ref0]] 240 - Web search format: [[cite:s1r1]], [[cite:s2r3]] (s=search turn, r=result index) 241 - Multi-citations: [[cite:search0, search1, search2]] 242 - Single bracket variants: [cite:search0] 243 """ 244 if not opts.read_citations: 245 # Remove all citation formats entirely 246 text = re.sub(r"\[?\[cite:[^\]]+\]\]?", "", text) 247 text = re.sub(r"\[(\d+)\]", "", text) 248 return text 249 250 # Handle web search format: [[cite:s1r1]], [[cite:s2r3]] 251 def replace_web_search_citation(match): 252 search_turn = match.group(1) 253 result_index = match.group(2) 254 return f", search {search_turn} result {result_index}," 255 256 text = re.sub( 257 r"\[?\[cite:s(\d+)r(\d+)\]\]?", replace_web_search_citation, text 258 ) 259 260 # Handle SAM-style multi-citations 261 def replace_multi_citation(match): 262 content = match.group(1) 263 individual_pattern = r"(?:cite:)?(file|ref|search|research)?(\d+)" 264 citations = re.findall(individual_pattern, content) 265 if not citations: 266 return "" 267 268 spoken_parts = [] 269 for cite_type, num in citations: 270 cite_type = cite_type or "search" 271 display_num = str(int(num) + 1) 272 if cite_type == "research": 273 spoken_parts.append(f"research source {display_num}") 274 elif cite_type == "search": 275 spoken_parts.append(f"source {display_num}") 276 elif cite_type == "file": 277 spoken_parts.append(f"file {display_num}") 278 elif cite_type == "ref": 279 
spoken_parts.append(f"reference {display_num}") 280 else: 281 spoken_parts.append(f"source {display_num}") 282 283 if len(spoken_parts) == 1: 284 return f", {spoken_parts[0]}," 285 elif len(spoken_parts) == 2: 286 return f", {spoken_parts[0]} and {spoken_parts[1]}," 287 else: 288 return f', {", ".join(spoken_parts[:-1])}, and {spoken_parts[-1]},' 289 290 multi_cite_pattern = r"\[?\[cite:((?:(?:file|ref|search|research)?\d+)(?:\s*,\s*(?:cite:)?(?:file|ref|search|research)?\d+)+)\]\]?" 291 text = re.sub(multi_cite_pattern, replace_multi_citation, text) 292 293 # Handle SAM-style single citations 294 def replace_sam_citation(match): 295 cite_type = match.group(1) or "search" 296 num = match.group(2) 297 display_num = str(int(num) + 1) 298 299 if cite_type == "research": 300 spoken = f"research source {display_num}" 301 elif cite_type == "search": 302 spoken = f"source {display_num}" 303 elif cite_type == "file": 304 spoken = f"file {display_num}" 305 elif cite_type == "ref": 306 spoken = f"reference {display_num}" 307 else: 308 spoken = f"source {display_num}" 309 310 return f", {spoken}," 311 312 sam_cite_pattern = r"\[?\[cite:(?:(file|ref|search|research))?(\d+)\]\]?" 313 text = re.sub(sam_cite_pattern, replace_sam_citation, text) 314 315 # Handle simple citations [1], [2], etc. 316 if opts.citation_format: 317 318 def replace_simple_citation(match): 319 num = match.group(1) 320 spoken = opts.citation_format.replace("{n}", num) 321 return f", {spoken}," 322 323 text = re.sub(r"\[(\d+)\]", replace_simple_citation, text) 324 else: 325 text = re.sub(r"\[(\d+)\]", "", text) 326 327 return text 328 329 330 def _handle_bare_urls(text: str) -> str: 331 """Replace bare URLs with 'link' for natural speech.""" 332 url_pattern = r"(?<!\()\bhttps?://[^\s<>\[\]()]+\b" 333 return re.sub(url_pattern, "link", text) 334 335 336 def _normalize_whitespace(text: str) -> str: 337 """ 338 Normalize whitespace for natural speech. 
339 340 """ 341 # Replace newlines/carriage returns with single space 342 text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ") 343 344 # Replace multiple spaces with single space using split/join 345 # This is more efficient than regex for this simple case 346 text = " ".join(text.split()) 347 348 # Clean up punctuation spacing - remove spaces before punctuation 349 # Using character-by-character replacement to avoid regex quantifiers 350 for punct in ".,!?;:": 351 text = text.replace(f" {punct}", punct) 352 353 # Remove duplicate commas from citation handling 354 while ",," in text or ", ," in text: 355 text = text.replace(",,", ",").replace(", ,", ",") 356 357 return text