# mistune.py
# coding: utf-8
"""
    mistune
    ~~~~~~~

    The fastest markdown parser in pure Python with renderer feature.

    :copyright: (c) 2014 - 2018 by Hsiaoming Yang.
"""

import re
import inspect

__version__ = '0.8.4'
__author__ = 'Hsiaoming Yang <me@lepture.com>'
__all__ = [
    'BlockGrammar', 'BlockLexer',
    'InlineGrammar', 'InlineLexer',
    'Renderer', 'Markdown',
    'markdown', 'escape',
]


_key_pattern = re.compile(r'\s+')
_nonalpha_pattern = re.compile(r'\W')
# An ampersand that does not already start an entity such as ``&amp;``.
_escape_pattern = re.compile(r'&(?!#?\w+;)')
_newline_pattern = re.compile(r'\r\n|\r')
_block_quote_leading_pattern = re.compile(r'^ *> ?', flags=re.M)
_block_code_leading_pattern = re.compile(r'^ {4}', re.M)
_inline_tags = [
    'a', 'em', 'strong', 'small', 's', 'cite', 'q', 'dfn', 'abbr', 'data',
    'time', 'code', 'var', 'samp', 'kbd', 'sub', 'sup', 'i', 'b', 'u', 'mark',
    'ruby', 'rt', 'rp', 'bdi', 'bdo', 'span', 'br', 'wbr', 'ins', 'del',
    'img', 'font',
]
_pre_tags = ['pre', 'script', 'style']
_valid_end = r'(?!:/|[^\w\s@]*@)\b'
_valid_attr = r'''\s*[a-zA-Z\-](?:\s*\=\s*(?:"[^"]*"|'[^']*'|[^\s'">]+))?'''
# Any tag name that is not one of the inline tags listed above.
_block_tag = r'(?!(?:%s)\b)\w+%s' % ('|'.join(_inline_tags), _valid_end)
_scheme_blacklist = ('javascript:', 'vbscript:')


def _pure_pattern(regex):
    """Return the source of a compiled regex without a leading ``^``."""
    pattern = regex.pattern
    if pattern.startswith('^'):
        pattern = pattern[1:]
    return pattern


def _keyify(key):
    """Normalize a link/footnote key for dictionary lookups.

    The key is lower-cased, HTML-escaped (quotes included) and every run of
    whitespace is collapsed into a single space.
    """
    key = escape(key.lower(), quote=True)
    return _key_pattern.sub(' ', key)


def escape(text, quote=False, smart_amp=True):
    """Replace special characters "&", "<" and ">" to HTML-safe sequences.

    The original cgi.escape will always escape "&", but you can control
    this one for a smart escape amp.

    :param text: the text to escape.
    :param quote: if set to True, " and ' will be escaped.
    :param smart_amp: if set to False, & will always be escaped.
    :returns: the escaped text.
    """
    if smart_amp:
        # leave ampersands that already introduce an entity untouched
        text = _escape_pattern.sub('&amp;', text)
    else:
        text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    if quote:
        text = text.replace('"', '&quot;')
        text = text.replace("'", '&#39;')
    return text


def escape_link(url):
    """Remove dangerous URL schemes like javascript: and escape afterwards.

    :param url: the raw link target.
    :returns: an empty string when the url starts with a blacklisted scheme,
              otherwise the url escaped for use inside an HTML attribute.
    """
    lower_url = url.lower().strip('\x00\x1a \n\r\t')

    # Drop every character that is not alphanumeric, "/" or ":" so that
    # obfuscated schemes are still compared against the blacklist.
    cleaned = re.sub(r'[^A-Za-z0-9\/:]+', '', lower_url)
    for scheme in _scheme_blacklist:
        if cleaned.startswith(scheme):
            return ''
    return escape(url, quote=True, smart_amp=False)


def preprocessing(text, tab=4):
    """Normalize raw markdown text before it is handed to the block lexer.

    Converts ``\\r\\n`` and ``\\r`` to ``\\n``, expands tabs to ``tab``
    spaces, replaces the ``\\u2424`` sequence with a newline and empties
    lines that consist only of spaces.
    """
    text = _newline_pattern.sub('\n', text)
    text = text.expandtabs(tab)
    text = text.replace('\u2424', '\n')
    pattern = re.compile(r'^ +$', re.M)
    return pattern.sub('', text)


class BlockGrammar(object):
    """Grammars for block level tokens."""

    def_links = re.compile(
        r'^ *\[([^^\]]+)\]: *'  # [key]:
        r'<?([^\s>]+)>?'  # <link> or link
        r'(?: +["(]([^\n]+)[")])? *(?:\n+|$)'
    )
    def_footnotes = re.compile(
        r'^\[\^([^\]]+)\]: *('
        r'[^\n]*(?:\n+|$)'  # [^key]:
        r'(?: {1,}[^\n]*(?:\n+|$))*'
        r')'
    )

    newline = re.compile(r'^\n+')
    block_code = re.compile(r'^( {4}[^\n]+\n*)+')
    fences = re.compile(
        r'^ *(`{3,}|~{3,}) *([^`\s]+)? *\n'  # ```lang
        r'([\s\S]+?)\s*'
        r'\1 *(?:\n+|$)'  # ```
    )
    hrule = re.compile(r'^ {0,3}[-*_](?: *[-*_]){2,} *(?:\n+|$)')
    heading = re.compile(r'^ *(#{1,6}) *([^\n]+?) *#* *(?:\n+|$)')
    lheading = re.compile(r'^([^\n]+)\n *(=|-)+ *(?:\n+|$)')
    block_quote = re.compile(r'^( *>[^\n]+(\n[^\n]+)*\n*)+')
    list_block = re.compile(
        r'^( *)(?=[*+-]|\d+\.)(([*+-])?(?:\d+\.)?) [\s\S]+?'
        r'(?:'
        r'\n+(?=\1?(?:[-*_] *){3,}(?:\n+|$))'  # hrule
        r'|\n+(?=%s)'  # def links
        r'|\n+(?=%s)'  # def footnotes
        r'|\n+(?=\1(?(3)\d+\.|[*+-]) )'  # heterogeneous bullet
        r'|\n{2,}'
        r'(?! )'
        r'(?!\1(?:[*+-]|\d+\.) )\n*'
        r'|'
        r'\s*$)' % (
            _pure_pattern(def_links),
            _pure_pattern(def_footnotes),
        )
    )
    list_item = re.compile(
        r'^(( *)(?:[*+-]|\d+\.) [^\n]*'
        r'(?:\n(?!\2(?:[*+-]|\d+\.) )[^\n]*)*)',
        flags=re.M
    )
    list_bullet = re.compile(r'^ *(?:[*+-]|\d+\.) +')
    # The embedded patterns bring their own groups, so their back-references
    # are renumbered to account for the paragraph's outer group.
    paragraph = re.compile(
        r'^((?:[^\n]+\n?(?!'
        r'%s|%s|%s|%s|%s|%s|%s|%s|%s'
        r'))+)\n*' % (
            _pure_pattern(fences).replace(r'\1', r'\2'),
            _pure_pattern(list_block).replace(r'\1', r'\3'),
            _pure_pattern(hrule),
            _pure_pattern(heading),
            _pure_pattern(lheading),
            _pure_pattern(block_quote),
            _pure_pattern(def_links),
            _pure_pattern(def_footnotes),
            '<' + _block_tag,
        )
    )
    block_html = re.compile(
        r'^ *(?:%s|%s|%s) *(?:\n{2,}|\s*$)' % (
            r'<!--[\s\S]*?-->',
            r'<(%s)((?:%s)*?)>([\s\S]*?)<\/\1>' % (_block_tag, _valid_attr),
            r'<%s(?:%s)*?\s*\/?>' % (_block_tag, _valid_attr),
        )
    )
    table = re.compile(
        r'^ *\|(.+)\n *\|( *[-:]+[-| :]*)\n((?: *\|.*(?:\n|$))*)\n*'
    )
    nptable = re.compile(
        r'^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*'
    )
    text = re.compile(r'^[^\n]+')


class BlockLexer(object):
    """Block level lexer for block grammars."""
    grammar_class = BlockGrammar

    default_rules = [
        'newline', 'hrule', 'block_code', 'fences', 'heading',
        'nptable', 'lheading', 'block_quote',
        'list_block', 'block_html', 'def_links',
        'def_footnotes', 'table', 'paragraph', 'text'
    ]

    list_rules = (
        'newline', 'block_code', 'fences', 'lheading', 'hrule',
        'block_quote', 'list_block', 'block_html', 'text',
    )

    footnote_rules = (
        'newline', 'block_code', 'fences', 'heading',
        'nptable', 'lheading', 'hrule', 'block_quote',
        'list_block', 'block_html', 'table', 'paragraph', 'text'
    )

    def __init__(self, rules=None, **kwargs):
        """Create a lexer.

        :param rules: a grammar instance; defaults to ``grammar_class()``.
        :param max_recursive_depth: keyword option limiting how deep nested
                                    lists and block quotes are parsed
                                    (default 6).
        """
        self.tokens = []
        self.def_links = {}
        self.def_footnotes = {}

        if not rules:
            rules = self.grammar_class()

        self.rules = rules
        self._max_recursive_depth = kwargs.get('max_recursive_depth', 6)
        self._list_depth = 0
        self._blockquote_depth = 0

    def __call__(self, text, rules=None):
        return self.parse(text, rules)

    def parse(self, text, rules=None):
        """Tokenize ``text`` and return the accumulated token list.

        :param text: markdown text.
        :param rules: names of the grammar rules to try, in order; defaults
                      to ``default_rules``.
        :raises RuntimeError: if no rule matches the remaining text.
        """
        text = text.rstrip('\n')

        if not rules:
            rules = self.default_rules

        def manipulate(text):
            # the first rule that matches handles the head of the text
            for key in rules:
                rule = getattr(self.rules, key)
                m = rule.match(text)
                if not m:
                    continue
                getattr(self, 'parse_%s' % key)(m)
                return m
            return False  # pragma: no cover

        while text:
            m = manipulate(text)
            if m is not False:
                text = text[len(m.group(0)):]
                continue
            if text:  # pragma: no cover
                raise RuntimeError('Infinite loop at: %s' % text)
        return self.tokens

    def parse_newline(self, m):
        """Emit a ``newline`` token for runs of more than one newline."""
        length = len(m.group(0))
        if length > 1:
            self.tokens.append({'type': 'newline'})

    def parse_block_code(self, m):
        """Emit a ``code`` token for an indented code block."""
        # clean leading whitespace
        code = _block_code_leading_pattern.sub('', m.group(0))
        self.tokens.append({
            'type': 'code',
            'lang': None,
            'text': code,
        })

    def parse_fences(self, m):
        """Emit a ``code`` token for a fenced code block."""
        self.tokens.append({
            'type': 'code',
            'lang': m.group(2),
            'text': m.group(3),
        })

    def parse_heading(self, m):
        """Emit a ``heading`` token for an ATX (``#``) heading."""
        self.tokens.append({
            'type': 'heading',
            'level': len(m.group(1)),
            'text': m.group(2),
        })

    def parse_lheading(self, m):
        """Parse setext heading."""
        self.tokens.append({
            'type': 'heading',
            'level': 1 if m.group(2) == '=' else 2,
            'text': m.group(1),
        })

    def parse_hrule(self, m):
        """Emit an ``hrule`` token."""
        self.tokens.append({'type': 'hrule'})

    def parse_list_block(self, m):
        """Emit the tokens of a whole list.

        Beyond ``max_recursive_depth`` nested lists, the matched text is
        emitted as a single plain-text item instead of being parsed further.
        """
        bull = m.group(2)
        self.tokens.append({
            'type': 'list_start',
            'ordered': '.' in bull,
        })
        self._list_depth += 1
        if self._list_depth > self._max_recursive_depth:
            self.tokens.append({'type': 'list_item_start'})
            self.parse_text(m)
            self.tokens.append({'type': 'list_item_end'})
        else:
            cap = m.group(0)
            self._process_list_item(cap, bull)
        self.tokens.append({'type': 'list_end'})
        self._list_depth -= 1

    def _process_list_item(self, cap, bull):
        """Split a list block into items and tokenize each of them.

        :param cap: full text of the list block.
        :param bull: bullet of the first item (unused here).
        """
        cap = self.rules.list_item.findall(cap)

        _next = False
        length = len(cap)

        for i, groups in enumerate(cap):
            item = groups[0]

            # remove the bullet
            space = len(item)
            item = self.rules.list_bullet.sub('', item)

            # outdent
            if '\n ' in item:
                space = space - len(item)
                pattern = re.compile(r'^ {1,%d}' % space, flags=re.M)
                item = pattern.sub('', item)

            # determine whether item is loose or not
            loose = _next
            if not loose and re.search(r'\n\n(?!\s*$)', item):
                loose = True

            rest = len(item)
            if i != length - 1 and rest:
                _next = item[rest-1] == '\n'
                if not loose:
                    loose = _next

            if loose:
                t = 'loose_item_start'
            else:
                t = 'list_item_start'

            self.tokens.append({'type': t})
            # recurse
            self.parse(item, self.list_rules)
            self.tokens.append({'type': 'list_item_end'})

    def parse_block_quote(self, m):
        """Emit the tokens of a block quote.

        Beyond ``max_recursive_depth`` nested quotes, the matched text is
        emitted as plain text instead of being parsed further.
        """
        self.tokens.append({'type': 'block_quote_start'})
        self._blockquote_depth += 1
        if self._blockquote_depth > self._max_recursive_depth:
            self.parse_text(m)
        else:
            # clean leading >
            cap = _block_quote_leading_pattern.sub('', m.group(0))
            self.parse(cap)
        self.tokens.append({'type': 'block_quote_end'})
        self._blockquote_depth -= 1

    def parse_def_links(self, m):
        """Record a ``[key]: link "title"`` definition; emits no token."""
        key = _keyify(m.group(1))
        self.def_links[key] = {
            'link': m.group(2),
            'title': m.group(3),
        }

    def parse_def_footnotes(self, m):
        """Record a footnote definition and tokenize its content.

        Only the first definition of a key is used.
        """
        key = _keyify(m.group(1))
        if key in self.def_footnotes:
            # footnote is already defined
            return

        # 0 means "defined, but not referenced yet"
        self.def_footnotes[key] = 0

        self.tokens.append({
            'type': 'footnote_start',
            'key': key,
        })

        text = m.group(2)

        if '\n' in text:
            lines = text.split('\n')
            # smallest non-zero indentation among the continuation lines
            whitespace = None
            for line in lines[1:]:
                space = len(line) - len(line.lstrip())
                if space and (not whitespace or space < whitespace):
                    whitespace = space
            newlines = [lines[0]]
            for line in lines[1:]:
                newlines.append(line[whitespace:])
            text = '\n'.join(newlines)

        self.parse(text, self.footnote_rules)

        self.tokens.append({
            'type': 'footnote_end',
            'key': key,
        })

    def parse_table(self, m):
        """Emit a ``table`` token for a table with leading pipes."""
        item = self._process_table(m)

        cells = re.sub(r'(?: *\| *)?\n$', '', m.group(3))
        cells = cells.split('\n')
        for i, v in enumerate(cells):
            v = re.sub(r'^ *\| *| *\| *$', '', v)
            cells[i] = re.split(r' *(?<!\\)\| *', v)

        item['cells'] = self._process_cells(cells)
        self.tokens.append(item)

    def parse_nptable(self, m):
        """Emit a ``table`` token for a table without leading pipes."""
        item = self._process_table(m)

        cells = re.sub(r'\n$', '', m.group(3))
        cells = cells.split('\n')
        for i, v in enumerate(cells):
            cells[i] = re.split(r' *(?<!\\)\| *', v)

        item['cells'] = self._process_cells(cells)
        self.tokens.append(item)

    def _process_table(self, m):
        """Build a ``table`` token (header and alignments, no cells yet)."""
        header = re.sub(r'^ *| *\| *$', '', m.group(1))
        header = re.split(r' *\| *', header)
        align = re.sub(r' *|\| *$', '', m.group(2))
        align = re.split(r' *\| *', align)

        for i, v in enumerate(align):
            if re.search(r'^ *-+: *$', v):
                align[i] = 'right'
            elif re.search(r'^ *:-+: *$', v):
                align[i] = 'center'
            elif re.search(r'^ *:-+ *$', v):
                align[i] = 'left'
            else:
                align[i] = None

        item = {
            'type': 'table',
            'header': header,
            'align': align,
        }
        return item

    def _process_cells(self, cells):
        """Replace escaped pipes (``\\|``) with ``|`` in every cell.

        The nested list is modified in place and returned.
        """
        for i, line in enumerate(cells):
            for c, cell in enumerate(line):
                # de-escape any pipe inside the cell here
                cells[i][c] = re.sub(r'\\\|', '|', cell)

        return cells

    def parse_block_html(self, m):
        """Emit an ``open_html`` token for ``<tag>...</tag>`` blocks and a
        ``close_html`` token for comments and tags without content."""
        tag = m.group(1)
        if not tag:
            text = m.group(0)
            self.tokens.append({
                'type': 'close_html',
                'text': text
            })
        else:
            attr = m.group(2)
            text = m.group(3)
            self.tokens.append({
                'type': 'open_html',
                'tag': tag,
                'extra': attr,
                'text': text
            })

    def parse_paragraph(self, m):
        """Emit a ``paragraph`` token."""
        text = m.group(1).rstrip('\n')
        self.tokens.append({'type': 'paragraph', 'text': text})

    def parse_text(self, m):
        """Emit a ``text`` token holding the whole match."""
        text = m.group(0)
        self.tokens.append({'type': 'text', 'text': text})


class InlineGrammar(object):
    """Grammars for inline level tokens."""

    escape = re.compile(r'^\\([\\`*{}\[\]()#+\-.!_>~|])')  # \* \+ \! ....
    inline_html = re.compile(
        r'^(?:%s|%s|%s)' % (
            r'<!--[\s\S]*?-->',
            r'<(\w+%s)((?:%s)*?)\s*>([\s\S]*?)<\/\1>' % (
                _valid_end, _valid_attr),
            r'<\w+%s(?:%s)*?\s*\/?>' % (_valid_end, _valid_attr),
        )
    )
    autolink = re.compile(r'^<([^ >]+(@|:)[^ >]+)>')
    link = re.compile(
        r'^!?\[('
        r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*'
        r')\]\('
        r'''\s*(<)?([\s\S]*?)(?(2)>)(?:\s+['"]([\s\S]*?)['"])?\s*'''
        r'\)'
    )
    reflink = re.compile(
        r'^!?\[('
        r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*'
        r')\]\s*\[([^^\]]*)\]'
    )
    nolink = re.compile(r'^!?\[((?:\[[^\]]*\]|[^\[\]])*)\]')
    url = re.compile(r'''^(https?:\/\/[^\s<]+[^<.,:;"')\]\s])''')
    double_emphasis = re.compile(
        r'^_{2}([\s\S]+?)_{2}(?!_)'  # __word__
        r'|'
        r'^\*{2}([\s\S]+?)\*{2}(?!\*)'  # **word**
    )
    emphasis = re.compile(
        r'^\b_((?:__|[^_])+?)_\b'  # _word_
        r'|'
        r'^\*((?:\*\*|[^\*])+?)\*(?!\*)'  # *word*
    )
    code = re.compile(r'^(`+)\s*([\s\S]*?[^`])\s*\1(?!`)')  # `code`
    linebreak = re.compile(r'^ {2,}\n(?!\s*$)')
    strikethrough = re.compile(r'^~~(?=\S)([\s\S]*?\S)~~')  # ~~word~~
    footnote = re.compile(r'^\[\^([^\]]+)\]')
    text = re.compile(r'^[\s\S]+?(?=[\\<!\[_*`~]|https?://| {2,}\n|$)')

    def hard_wrap(self):
        """Grammar for hard wrap linebreak. You don't need to add two
        spaces at the end of a line.
        """
        self.linebreak = re.compile(r'^ *\n(?!\s*$)')
        self.text = re.compile(
            r'^[\s\S]+?(?=[\\<!\[_*`~]|https?://| *\n|$)'
        )


class InlineLexer(object):
    """Inline level lexer for inline grammars."""
    grammar_class = InlineGrammar

    default_rules = [
        'escape', 'inline_html', 'autolink', 'url',
        'footnote', 'link', 'reflink', 'nolink',
        'double_emphasis', 'emphasis', 'code',
        'linebreak', 'strikethrough', 'text',
    ]
    inline_html_rules = [
        'escape', 'inline_html', 'autolink', 'url', 'link', 'reflink',
        'nolink', 'double_emphasis', 'emphasis', 'code',
        'linebreak', 'strikethrough', 'text',
    ]

    def __init__(self, renderer, rules=None, **kwargs):
        """Create a lexer.

        :param renderer: renderer whose methods produce the output; its
                         ``options`` override ``kwargs``.
        :param rules: a grammar instance; defaults to ``grammar_class()``.
        """
        self.renderer = renderer
        self.links = {}
        self.footnotes = {}
        self.footnote_index = 0

        if not rules:
            rules = self.grammar_class()

        kwargs.update(self.renderer.options)
        if kwargs.get('hard_wrap'):
            rules.hard_wrap()

        self.rules = rules

        self._in_link = False
        self._in_footnote = False
        self._parse_inline_html = kwargs.get('parse_inline_html')

    def __call__(self, text, rules=None):
        return self.output(text, rules)

    def setup(self, links, footnotes):
        """Install the link and footnote definitions found by the block
        lexer and restart footnote numbering."""
        self.footnote_index = 0
        self.links = links or {}
        self.footnotes = footnotes or {}

    def output(self, text, rules=None):
        """Render ``text`` with the inline rules.

        :param text: inline markdown text.
        :param rules: names of the grammar rules to try, in order; defaults
                      to a copy of ``default_rules``.
        :raises RuntimeError: if no rule produces output for the rest of
                              the text.
        """
        text = text.rstrip('\n')
        if not rules:
            rules = list(self.default_rules)

        # footnote references are not rendered inside a footnote body
        if self._in_footnote and 'footnote' in rules:
            rules.remove('footnote')

        output = self.renderer.placeholder()

        def manipulate(text):
            for key in rules:
                pattern = getattr(self.rules, key)
                m = pattern.match(text)
                if not m:
                    continue
                self.line_match = m
                out = getattr(self, 'output_%s' % key)(m)
                # a handler returning None declines the match; try the
                # next rule
                if out is not None:
                    return m, out
            return False  # pragma: no cover

        while text:
            ret = manipulate(text)
            if ret is not False:
                m, out = ret
                output += out
                text = text[len(m.group(0)):]
                continue
            if text:  # pragma: no cover
                raise RuntimeError('Infinite loop at: %s' % text)

        return output

    def output_escape(self, m):
        """Render a backslash-escaped character."""
        text = m.group(1)
        return self.renderer.escape(text)

    def output_autolink(self, m):
        """Render ``<link>`` or ``<email>``."""
        link = m.group(1)
        is_email = m.group(2) == '@'
        return self.renderer.autolink(link, is_email)

    def output_url(self, m):
        """Render a bare URL; inside a link it stays plain text."""
        link = m.group(1)
        if self._in_link:
            return self.renderer.text(link)
        return self.renderer.autolink(link, False)

    def output_inline_html(self, m):
        """Render inline HTML, parsing its content as markdown only when
        ``parse_inline_html`` is enabled and the tag is an inline tag."""
        tag = m.group(1)
        if self._parse_inline_html and tag in _inline_tags:
            text = m.group(3)
            if tag == 'a':
                self._in_link = True
                text = self.output(text, rules=self.inline_html_rules)
                self._in_link = False
            else:
                text = self.output(text, rules=self.inline_html_rules)
            extra = m.group(2) or ''
            html = '<%s%s>%s</%s>' % (tag, extra, text, tag)
        else:
            html = m.group(0)
        return self.renderer.inline_html(html)

    def output_footnote(self, m):
        """Render a footnote reference.

        Returns None (declining the match) when the footnote is undefined
        or has already been referenced.
        """
        key = _keyify(m.group(1))
        if key not in self.footnotes:
            return None
        if self.footnotes[key]:
            return None
        self.footnote_index += 1
        self.footnotes[key] = self.footnote_index
        return self.renderer.footnote_ref(key, self.footnote_index)

    def output_link(self, m):
        """Render an inline ``[text](link "title")`` link or image."""
        return self._process_link(m, m.group(3), m.group(4))

    def output_reflink(self, m):
        """Render ``[text][key]``; declines when the key is undefined."""
        key = _keyify(m.group(2) or m.group(1))
        if key not in self.links:
            return None
        ret = self.links[key]
        return self._process_link(m, ret['link'], ret['title'])

    def output_nolink(self, m):
        """Render ``[key]``; declines when the key is undefined."""
        key = _keyify(m.group(1))
        if key not in self.links:
            return None
        ret = self.links[key]
        return self._process_link(m, ret['link'], ret['title'])

    def _process_link(self, m, link, title=None):
        """Render the match as an image (leading ``!``) or as a link."""
        line = m.group(0)
        text = m.group(1)
        if line[0] == '!':
            return self.renderer.image(link, title, text)

        self._in_link = True
        text = self.output(text)
        self._in_link = False
        return self.renderer.link(link, title, text)

    def output_double_emphasis(self, m):
        """Render ``**text**`` or ``__text__``."""
        text = m.group(2) or m.group(1)
        text = self.output(text)
        return self.renderer.double_emphasis(text)

    def output_emphasis(self, m):
        """Render ``*text*`` or ``_text_``."""
        text = m.group(2) or m.group(1)
        text = self.output(text)
        return self.renderer.emphasis(text)

    def output_code(self, m):
        """Render an inline code span."""
        text = m.group(2)
        return self.renderer.codespan(text)

    def output_linebreak(self, m):
        """Render a line break."""
        return self.renderer.linebreak()

    def output_strikethrough(self, m):
        """Render ``~~text~~``."""
        text = self.output(m.group(1))
        return self.renderer.strikethrough(text)

    def output_text(self, m):
        """Render plain text."""
        text = m.group(0)
        return self.renderer.text(text)


class Renderer(object):
    """The default HTML renderer for rendering Markdown.
    """

    def __init__(self, **kwargs):
        self.options = kwargs

    def placeholder(self):
        """Returns the default, empty output value for the renderer.

        All renderer methods use the '+=' operator to append to this value.
        Default is a string so rendering HTML can build up a result string with
        the rendered Markdown.

        Can be overridden by Renderer subclasses to be types like an empty
        list, allowing the renderer to create a tree-like structure to
        represent the document (which can then be reprocessed later into a
        separate format like docx or pdf).
        """
        return ''

    def block_code(self, code, lang=None):
        """Rendering block level code. ``pre > code``.

        :param code: text content of the code block.
        :param lang: language of the given code.
        """
        code = code.rstrip('\n')
        if not lang:
            code = escape(code, smart_amp=False)
            return '<pre><code>%s\n</code></pre>\n' % code
        code = escape(code, quote=True, smart_amp=False)
        # lang comes from the document and ends up inside an attribute
        # value, so it must not be able to break out of the quotes
        lang = escape(lang, quote=True)
        return '<pre><code class="lang-%s">%s\n</code></pre>\n' % (lang, code)

    def block_quote(self, text):
        """Rendering <blockquote> with the given text.

        :param text: text content of the blockquote.
        """
        return '<blockquote>%s\n</blockquote>\n' % text.rstrip('\n')

    def block_html(self, html):
        """Rendering block level pure html content.

        :param html: text content of the html snippet.
        """
        if self.options.get('skip_style') and \
           html.lower().startswith('<style'):
            return ''
        if self.options.get('escape'):
            return escape(html)
        return html

    def header(self, text, level, raw=None):
        """Rendering header/heading tags like ``<h1>`` ``<h2>``.

        :param text: rendered text content for the header.
        :param level: a number for the header level, for example: 1.
        :param raw: raw text content of the header.
        """
        return '<h%d>%s</h%d>\n' % (level, text, level)

    def hrule(self):
        """Rendering method for ``<hr>`` tag."""
        if self.options.get('use_xhtml'):
            return '<hr />\n'
        return '<hr>\n'

    def list(self, body, ordered=True):
        """Rendering list tags like ``<ul>`` and ``<ol>``.

        :param body: body contents of the list.
        :param ordered: whether this list is ordered or not.
        """
        tag = 'ul'
        if ordered:
            tag = 'ol'
        return '<%s>\n%s</%s>\n' % (tag, body, tag)

    def list_item(self, text):
        """Rendering list item snippet. Like ``<li>``."""
        return '<li>%s</li>\n' % text

    def paragraph(self, text):
        """Rendering paragraph tags. Like ``<p>``."""
        return '<p>%s</p>\n' % text.strip(' ')

    def table(self, header, body):
        """Rendering table element. Wrap header and body in it.

        :param header: header part of the table.
        :param body: body part of the table.
        """
        return (
            '<table>\n<thead>%s</thead>\n'
            '<tbody>\n%s</tbody>\n</table>\n'
        ) % (header, body)

    def table_row(self, content):
        """Rendering a table row. Like ``<tr>``.

        :param content: content of current table row.
        """
        return '<tr>\n%s</tr>\n' % content

    def table_cell(self, content, **flags):
        """Rendering a table cell. Like ``<th>`` ``<td>``.

        :param content: content of current table cell.
        :param header: whether this is header or not.
        :param align: align of current table cell.
        """
        if flags['header']:
            tag = 'th'
        else:
            tag = 'td'
        align = flags['align']
        if not align:
            return '<%s>%s</%s>\n' % (tag, content, tag)
        return '<%s style="text-align:%s">%s</%s>\n' % (
            tag, align, content, tag
        )

    def double_emphasis(self, text):
        """Rendering **strong** text.

        :param text: text content for emphasis.
        """
        return '<strong>%s</strong>' % text

    def emphasis(self, text):
        """Rendering *emphasis* text.

        :param text: text content for emphasis.
        """
        return '<em>%s</em>' % text

    def codespan(self, text):
        """Rendering inline `code` text.

        :param text: text content for inline code.
        """
        text = escape(text.rstrip(), smart_amp=False)
        return '<code>%s</code>' % text

    def linebreak(self):
        """Rendering line break like ``<br>``."""
        if self.options.get('use_xhtml'):
            return '<br />\n'
        return '<br>\n'

    def strikethrough(self, text):
        """Rendering ~~strikethrough~~ text.

        :param text: text content for strikethrough.
        """
        return '<del>%s</del>' % text

    def text(self, text):
        """Rendering unformatted text.

        :param text: text content.
        """
        if self.options.get('parse_block_html'):
            return text
        return escape(text)

    def escape(self, text):
        """Rendering escape sequence.

        :param text: text content.
        """
        return escape(text)

    def autolink(self, link, is_email=False):
        """Rendering a given link or email address.

        :param link: link content or email address.
        :param is_email: whether this is an email or not.
        """
        text = link = escape_link(link)
        if is_email:
            link = 'mailto:%s' % link
        return '<a href="%s">%s</a>' % (link, text)

    def link(self, link, title, text):
        """Rendering a given link with content and title.

        :param link: href link for ``<a>`` tag.
        :param title: title content for `title` attribute.
        :param text: text content for description.
        """
        link = escape_link(link)
        if not title:
            return '<a href="%s">%s</a>' % (link, text)
        title = escape(title, quote=True)
        return '<a href="%s" title="%s">%s</a>' % (link, title, text)

    def image(self, src, title, text):
        """Rendering a image with title and text.

        :param src: source link of the image.
        :param title: title text of the image.
        :param text: alt text of the image.
        """
        src = escape_link(src)
        text = escape(text, quote=True)
        if title:
            title = escape(title, quote=True)
            html = '<img src="%s" alt="%s" title="%s"' % (src, text, title)
        else:
            html = '<img src="%s" alt="%s"' % (src, text)
        if self.options.get('use_xhtml'):
            return '%s />' % html
        return '%s>' % html

    def inline_html(self, html):
        """Rendering span level pure html content.

        :param html: text content of the html snippet.
        """
        if self.options.get('escape'):
            return escape(html)
        return html

    def newline(self):
        """Rendering newline element."""
        return ''

    def footnote_ref(self, key, index):
        """Rendering the ref anchor of a footnote.

        :param key: identity key for the footnote.
        :param index: the index count of current footnote.
        """
        html = (
            '<sup class="footnote-ref" id="fnref-%s">'
            '<a href="#fn-%s">%d</a></sup>'
        ) % (escape(key), escape(key), index)
        return html

    def footnote_item(self, key, text):
        """Rendering a footnote item.

        :param key: identity key for the footnote.
        :param text: text content of the footnote.
        """
        back = (
            '<a href="#fnref-%s" class="footnote">↩</a>'
        ) % escape(key)
        text = text.rstrip()
        # put the back-link inside the last paragraph when there is one
        if text.endswith('</p>'):
            text = re.sub(r'<\/p>$', r'%s</p>' % back, text)
        else:
            text = '%s<p>%s</p>' % (text, back)
        html = '<li id="fn-%s">%s</li>\n' % (escape(key), text)
        return html

    def footnotes(self, text):
        """Wrapper for all footnotes.

        :param text: contents of all footnotes.
        """
        html = '<div class="footnotes">\n%s<ol>%s</ol>\n</div>\n'
        return html % (self.hrule(), text)


class Markdown(object):
    """The Markdown parser.

    :param renderer: An instance of ``Renderer``.
    :param inline: An inline lexer class or instance.
    :param block: A block lexer class or instance.
    """
    def __init__(self, renderer=None, inline=None, block=None, **kwargs):
        if not renderer:
            renderer = Renderer(**kwargs)
        else:
            kwargs.update(renderer.options)

        self.renderer = renderer

        if inline and inspect.isclass(inline):
            inline = inline(renderer, **kwargs)
        if block and inspect.isclass(block):
            block = block(**kwargs)

        if inline:
            self.inline = inline
        else:
            self.inline = InlineLexer(renderer, **kwargs)

        self.block = block or BlockLexer(BlockGrammar())
        self.footnotes = []
        self.tokens = []

        # detect if it should parse text in block html
        self._parse_block_html = kwargs.get('parse_block_html')

    def __call__(self, text):
        return self.parse(text)

    def render(self, text):
        """Render the Markdown text.

        :param text: markdown formatted text content.
        """
        return self.parse(text)

    def parse(self, text):
        """Render ``text`` and append the referenced footnotes, if any.

        :param text: markdown formatted text content.
        :returns: the rendered output.
        """
        out = self.output(preprocessing(text))

        # footnote key -> index given at its first reference
        # (0 when the footnote was defined but never referenced)
        keys = self.block.def_footnotes

        # reset block
        self.block.def_links = {}
        self.block.def_footnotes = {}

        # reset inline
        self.inline.links = {}
        self.inline.footnotes = {}

        if not self.footnotes:
            return out

        # keep referenced footnotes only; sort descending because the
        # list is consumed from its end with pop()
        footnotes = filter(lambda o: keys.get(o['key']), self.footnotes)
        self.footnotes = sorted(
            footnotes, key=lambda o: keys.get(o['key']), reverse=True
        )

        body = self.renderer.placeholder()
        while self.footnotes:
            note = self.footnotes.pop()
            body += self.renderer.footnote_item(
                note['key'], note['text']
            )

        out += self.renderer.footnotes(body)
        return out

    def pop(self):
        """Make the next token current and return it (None when done)."""
        if not self.tokens:
            return None
        self.token = self.tokens.pop()
        return self.token

    def peek(self):
        """Return the next token without consuming it (None when done)."""
        if self.tokens:
            return self.tokens[-1]
        return None  # pragma: no cover

    def output(self, text, rules=None):
        """Tokenize ``text`` with the block lexer and render the tokens."""
        self.tokens = self.block(text, rules)
        # reversed so that pop() yields tokens in document order
        self.tokens.reverse()

        self.inline.setup(self.block.def_links, self.block.def_footnotes)

        out = self.renderer.placeholder()
        while self.pop():
            out += self.tok()
        return out

    def tok(self):
        """Render the current token through its ``output_<type>`` method."""
        t = self.token['type']

        # special cases
        if t.endswith('_start'):
            t = t[:-6]

        return getattr(self, 'output_%s' % t)()

    def tok_text(self):
        """Join the current and following ``text`` tokens with newlines
        and render the result with the inline lexer."""
        text = self.token['text']
        upcoming = self.peek()
        # peek() returns None once the token stack is exhausted
        while upcoming is not None and upcoming['type'] == 'text':
            text += '\n' + self.pop()['text']
            upcoming = self.peek()
        return self.inline(text)

    def output_newline(self):
        return self.renderer.newline()

    def output_hrule(self):
        return self.renderer.hrule()

    def output_heading(self):
        return self.renderer.header(
            self.inline(self.token['text']),
            self.token['level'],
            self.token['text'],
        )

    def output_code(self):
        return self.renderer.block_code(
            self.token['text'], self.token['lang']
        )

    def output_table(self):
        """Render a ``table`` token: one header row and the body rows."""
        aligns = self.token['align']
        aligns_length = len(aligns)
        cell = self.renderer.placeholder()

        # header part
        header = self.renderer.placeholder()
        for i, value in enumerate(self.token['header']):
            align = aligns[i] if i < aligns_length else None
            flags = {'header': True, 'align': align}
            cell += self.renderer.table_cell(self.inline(value), **flags)

        header += self.renderer.table_row(cell)

        # body part
        body = self.renderer.placeholder()
        for i, row in enumerate(self.token['cells']):
            cell = self.renderer.placeholder()
            for j, value in enumerate(row):
                align = aligns[j] if j < aligns_length else None
                flags = {'header': False, 'align': align}
                cell += self.renderer.table_cell(self.inline(value), **flags)
            body += self.renderer.table_row(cell)

        return self.renderer.table(header, body)

    def output_block_quote(self):
        body = self.renderer.placeholder()
        while self.pop()['type'] != 'block_quote_end':
            body += self.tok()
        return self.renderer.block_quote(body)

    def output_list(self):
        ordered = self.token['ordered']
        body = self.renderer.placeholder()
        while self.pop()['type'] != 'list_end':
            body += self.tok()
        return self.renderer.list(body, ordered)

    def output_list_item(self):
        """Render a tight list item: its text is not wrapped in ``<p>``."""
        body = self.renderer.placeholder()
        while self.pop()['type'] != 'list_item_end':
            if self.token['type'] == 'text':
                body += self.tok_text()
            else:
                body += self.tok()

        return self.renderer.list_item(body)

    def output_loose_item(self):
        """Render a loose list item: its text becomes paragraphs."""
        body = self.renderer.placeholder()
        while self.pop()['type'] != 'list_item_end':
            body += self.tok()
        return self.renderer.list_item(body)

    def output_footnote(self):
        """Render a footnote body and store it for ``parse``; nothing is
        emitted at the position of the definition."""
        self.inline._in_footnote = True
        body = self.renderer.placeholder()
        key = self.token['key']
        while self.pop()['type'] != 'footnote_end':
            body += self.tok()
        self.footnotes.append({'key': key, 'text': body})
        self.inline._in_footnote = False
        return self.renderer.placeholder()

    def output_close_html(self):
        text = self.token['text']
        return self.renderer.block_html(text)

    def output_open_html(self):
        """Render ``<tag>...</tag>`` block html, parsing its content as
        inline markdown when ``parse_block_html`` is enabled and the tag is
        not ``pre``, ``script`` or ``style``."""
        text = self.token['text']
        tag = self.token['tag']
        if self._parse_block_html and tag not in _pre_tags:
            text = self.inline(text, rules=self.inline.inline_html_rules)
        extra = self.token.get('extra') or ''
        html = '<%s%s>%s</%s>' % (tag, extra, text, tag)
        return self.renderer.block_html(html)

    def output_paragraph(self):
        return self.renderer.paragraph(self.inline(self.token['text']))

    def output_text(self):
        return self.renderer.paragraph(self.tok_text())


def markdown(text, escape=True, **kwargs):
    """Render markdown formatted text to html.

    :param text: markdown formatted text content.
    :param escape: if set to False, all html tags will not be escaped.
    :param use_xhtml: output with xhtml tags.
    :param hard_wrap: if set to True, it will use the GFM line breaks feature.
    :param parse_block_html: parse text only in block level html.
    :param parse_inline_html: parse text only in inline level html.
    """
    return Markdown(escape=escape, **kwargs)(text)