serializer.pxi
# XML serialization and output functions

cdef object GzipFile
from gzip import GzipFile


cdef class SerialisationError(LxmlError):
    """A libxml2 error that occurred during serialisation.
    """


cdef enum _OutputMethods:
    OUTPUT_METHOD_XML
    OUTPUT_METHOD_HTML
    OUTPUT_METHOD_TEXT


cdef int _findOutputMethod(method) except -1:
    """Map an output method name to its ``OUTPUT_METHOD_*`` constant.

    ``None`` selects XML.  Names are compared case-insensitively against
    "xml", "html" and "text"; anything else raises ``ValueError``.
    """
    if method is None:
        return OUTPUT_METHOD_XML
    method = method.lower()
    if method == "xml":
        return OUTPUT_METHOD_XML
    if method == "html":
        return OUTPUT_METHOD_HTML
    if method == "text":
        return OUTPUT_METHOD_TEXT
    raise ValueError(f"unknown output method {method!r}")


cdef _textToString(xmlNode* c_node, encoding, bint with_tail):
    """Collect the text content of ``c_node`` into a string.

    If ``with_tail`` is true, the content of the text nodes that follow
    ``c_node`` (as found by ``_textNodeOrSkip()``) is appended.

    ``encoding`` controls the result type: the ``unicode`` type yields a
    decoded text string, ``None`` and UTF-8 yield the raw buffer bytes, and
    any other encoding name re-encodes the text with strict error handling.

    Raises ``MemoryError`` if the buffer cannot be created and
    ``SerialisationError`` if libxml2 fails to collect the content.
    """
    cdef bint needs_conversion
    cdef const_xmlChar* c_text
    cdef xmlNode* c_text_node
    cdef tree.xmlBuffer* c_buffer
    cdef int error_result

    c_buffer = tree.xmlBufferCreate()
    if c_buffer is NULL:
        raise MemoryError()

    with nogil:
        error_result = tree.xmlNodeBufGetContent(c_buffer, c_node)
        if with_tail:
            c_text_node = _textNodeOrSkip(c_node.next)
            while c_text_node is not NULL:
                tree.xmlBufferWriteChar(c_buffer, <const_char*>c_text_node.content)
                c_text_node = _textNodeOrSkip(c_text_node.next)
        c_text = tree.xmlBufferContent(c_buffer)

    if error_result < 0 or c_text is NULL:
        tree.xmlBufferFree(c_buffer)
        raise SerialisationError, u"Error during serialisation (out of memory?)"

    # From here on, the buffer is always released by the 'finally' clause.
    try:
        needs_conversion = 0
        if encoding is unicode:
            needs_conversion = 1
        elif encoding is not None:
            # Python prefers lower case encoding names
            encoding = encoding.lower()
            if encoding not in (u'utf8', u'utf-8'):
                if encoding == u'ascii':
                    if isutf8l(c_text, tree.xmlBufferLength(c_buffer)):
                        # will raise a decode error below
                        needs_conversion = 1
                else:
                    needs_conversion = 1

        if needs_conversion:
            text = (<const_char*>c_text)[:tree.xmlBufferLength(c_buffer)].decode('utf8')
            if encoding is not unicode:
                encoding = _utf8(encoding)
                text = python.PyUnicode_AsEncodedString(
                    text, encoding, 'strict')
        else:
            text = (<unsigned char*>c_text)[:tree.xmlBufferLength(c_buffer)]
    finally:
        tree.xmlBufferFree(c_buffer)
    return text


cdef _tostring(_Element element, encoding, doctype, method,
               bint write_xml_declaration, bint write_complete_document,
               bint pretty_print, bint with_tail, int standalone):
    u"""Serialize an element to an encoded string representation of its XML
    tree.

    Returns ``None`` for a ``None`` element.  The "text" method is delegated
    to ``_textToString()``.  With ``encoding`` being the ``unicode`` type, the
    result is decoded from UTF-8 into a text string, otherwise bytes are
    returned.

    Raises ``LookupError`` for an encoding that libxml2 does not know,
    ``MemoryError`` if the output buffer cannot be allocated, and whatever
    ``_raiseSerialisationError()`` raises for libxml2 output errors.
    """
    cdef tree.xmlOutputBuffer* c_buffer
    cdef tree.xmlBuf* c_result_buffer
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* c_enc
    cdef const_xmlChar* c_version
    cdef const_xmlChar* c_doctype
    cdef int c_method
    cdef int error_result
    if element is None:
        return None
    _assertValidNode(element)
    c_method = _findOutputMethod(method)
    if c_method == OUTPUT_METHOD_TEXT:
        return _textToString(element._c_node, encoding, with_tail)
    if encoding is None or encoding is unicode:
        c_enc = NULL
    else:
        encoding = _utf8(encoding)
        c_enc = _cstr(encoding)
    if doctype is None:
        c_doctype = NULL
    else:
        doctype = _utf8(doctype)
        c_doctype = _xcstr(doctype)
    # it is necessary to *and* find the encoding handler *and* use
    # encoding during output
    enchandler = tree.xmlFindCharEncodingHandler(c_enc)
    if enchandler is NULL and c_enc is not NULL:
        if encoding is not None:
            encoding = encoding.decode('UTF-8')
        raise LookupError, f"unknown encoding: '{encoding}'"
    c_buffer = tree.xmlAllocOutputBuffer(enchandler)
    if c_buffer is NULL:
        tree.xmlCharEncCloseFunc(enchandler)
        raise MemoryError()

    with nogil:
        _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_doctype, c_method,
                           write_xml_declaration, write_complete_document,
                           pretty_print, with_tail, standalone)
        tree.xmlOutputBufferFlush(c_buffer)
        # With an encoding handler in place, the converted output is in 'conv'.
        if c_buffer.conv is not NULL:
            c_result_buffer = c_buffer.conv
        else:
            c_result_buffer = c_buffer.buffer

    error_result = c_buffer.error
    if error_result != xmlerror.XML_ERR_OK:
        tree.xmlOutputBufferClose(c_buffer)
        _raiseSerialisationError(error_result)

    try:
        if encoding is unicode:
            result = (<unsigned char*>tree.xmlBufContent(
                c_result_buffer))[:tree.xmlBufUse(c_result_buffer)].decode('UTF-8')
        else:
            result = <bytes>(<unsigned char*>tree.xmlBufContent(
                c_result_buffer))[:tree.xmlBufUse(c_result_buffer)]
    finally:
        error_result = tree.xmlOutputBufferClose(c_buffer)
    if error_result == -1:
        _raiseSerialisationError(error_result)
    return result


cdef bytes _tostringC14N(element_or_tree, bint exclusive, bint with_comments, inclusive_ns_prefixes):
    """Serialise an element or document to C14N and return the bytes.

    For an ``_Element``, a fake root document is created around the element
    and destroyed again afterwards.  ``inclusive_ns_prefixes``, if non-empty,
    is converted to a C prefix array for ``xmlC14NDocDumpMemory()``.

    Raises ``C14NError`` if libxml2 reports a failure.
    """
    cdef xmlDoc* c_doc
    cdef xmlChar* c_buffer = NULL
    cdef int byte_count = -1
    cdef bytes result
    cdef _Document doc
    cdef _Element element
    cdef xmlChar **c_inclusive_ns_prefixes = NULL

    if isinstance(element_or_tree, _Element):
        _assertValidNode(<_Element>element_or_tree)
        doc = (<_Element>element_or_tree)._doc
        c_doc = _plainFakeRootDoc(doc._c_doc, (<_Element>element_or_tree)._c_node, 0)
    else:
        doc = _documentOrRaise(element_or_tree)
        _assertValidDoc(doc)
        c_doc = doc._c_doc

    try:
        # Convert the prefixes inside the 'try' so that a failure here still
        # runs the cleanup below and does not leak the fake root document.
        if inclusive_ns_prefixes:
            c_inclusive_ns_prefixes = _convert_ns_prefixes(c_doc.dict, inclusive_ns_prefixes)
        with nogil:
            byte_count = c14n.xmlC14NDocDumpMemory(
                c_doc, NULL, exclusive, c_inclusive_ns_prefixes, with_comments, &c_buffer)

    finally:
        _destroyFakeDoc(doc._c_doc, c_doc)
        if c_inclusive_ns_prefixes is not NULL:
            python.lxml_free(c_inclusive_ns_prefixes)

    if byte_count < 0 or c_buffer is NULL:
        if c_buffer is not NULL:
            tree.xmlFree(c_buffer)
        raise C14NError, u"C14N failed"
    try:
        result = c_buffer[:byte_count]
    finally:
        tree.xmlFree(c_buffer)
    return result

cdef _raiseSerialisationError(int error_result):
    """Raise the exception that corresponds to a libxml2 error code.

    ``XML_ERR_NO_MEMORY`` raises ``MemoryError``; every other code raises
    ``SerialisationError`` with the error name, or a generic message if the
    code has no known name.
    """
    if error_result == xmlerror.XML_ERR_NO_MEMORY:
        raise MemoryError()
    message = ErrorTypes._getName(error_result)
    if message is None:
        message = f"unknown error {error_result}"
    raise SerialisationError, message

############################################################
# low-level serialisation functions

cdef void _writeDoctype(tree.xmlOutputBuffer* c_buffer,
                        const_xmlChar* c_doctype) nogil:
    """Write the user provided doctype string followed by a newline."""
    tree.xmlOutputBufferWrite(c_buffer, tree.xmlStrlen(c_doctype),
                              <const_char*>c_doctype)
    tree.xmlOutputBufferWriteString(c_buffer, "\n")

cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer,
                             xmlNode* c_node, const_char* encoding, const_xmlChar* c_doctype,
                             int c_method, bint write_xml_declaration,
                             bint write_complete_document,
                             bint pretty_print, bint with_tail,
                             int standalone) nogil:
    """Serialise ``c_node`` into ``c_buffer``.

    Writes, in order: the XML declaration (XML method only), the siblings
    preceding the internal DTD subset, the doctype (the explicit one or the
    document's DTD), the siblings preceding the node, the node itself, its
    tail, and its following siblings.  Which parts are written depends on
    the flags.  Errors are reported through ``c_buffer.error``.
    """
    cdef xmlNode* c_nsdecl_node
    cdef xmlDoc* c_doc = c_node.doc
    if write_xml_declaration and c_method == OUTPUT_METHOD_XML:
        _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding, standalone)

    # comments/processing instructions before doctype declaration
    if write_complete_document and not c_buffer.error and c_doc.intSubset:
        _writePrevSiblings(c_buffer, <xmlNode*>c_doc.intSubset, encoding, pretty_print)

    if c_doctype:
        _writeDoctype(c_buffer, c_doctype)
    # write internal DTD subset, preceding PIs/comments, etc.
    if write_complete_document and not c_buffer.error:
        if c_doctype is NULL:
            _writeDtdToBuffer(c_buffer, c_doc, c_node.name, c_method, encoding)
        _writePrevSiblings(c_buffer, c_node, encoding, pretty_print)

    c_nsdecl_node = c_node
    if not c_node.parent or c_node.parent.type != tree.XML_DOCUMENT_NODE:
        # copy the node and add namespaces from parents
        # this is required to make libxml write them
        c_nsdecl_node = tree.xmlCopyNode(c_node, 2)
        if not c_nsdecl_node:
            c_buffer.error = xmlerror.XML_ERR_NO_MEMORY
            return
        _copyParentNamespaces(c_node, c_nsdecl_node)

        # The copy borrows the original's tree links for the dump only.
        c_nsdecl_node.parent = c_node.parent
        c_nsdecl_node.children = c_node.children
        c_nsdecl_node.last = c_node.last

    # write node
    if c_method == OUTPUT_METHOD_HTML:
        tree.htmlNodeDumpFormatOutput(
            c_buffer, c_doc, c_nsdecl_node, encoding, pretty_print)
    else:
        tree.xmlNodeDumpOutput(
            c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding)

    if c_nsdecl_node is not c_node:
        # clean up
        # Detach the borrowed children before freeing the copy.
        c_nsdecl_node.children = c_nsdecl_node.last = NULL
        tree.xmlFreeNode(c_nsdecl_node)

    if c_buffer.error:
        return

    # write tail, trailing comments, etc.
    if with_tail:
        _writeTail(c_buffer, c_node, encoding, c_method, pretty_print)
    if write_complete_document:
        _writeNextSiblings(c_buffer, c_node, encoding, pretty_print)
    if pretty_print:
        tree.xmlOutputBufferWrite(c_buffer, 1, "\n")

cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer,
                                    const_xmlChar* version, const_char* encoding,
                                    int standalone) nogil:
    """Write the ``<?xml ...?>`` declaration.

    A NULL ``version`` defaults to "1.0".  ``standalone`` of 0 writes
    "no", 1 writes "yes", any other value omits the attribute.
    """
    if version is NULL:
        version = <unsigned char*>"1.0"
    tree.xmlOutputBufferWrite(c_buffer, 15, "<?xml version='")
    tree.xmlOutputBufferWriteString(c_buffer, <const_char*>version)
    tree.xmlOutputBufferWrite(c_buffer, 12, "' encoding='")
    tree.xmlOutputBufferWriteString(c_buffer, encoding)
    if standalone == 0:
        tree.xmlOutputBufferWrite(c_buffer, 20, "' standalone='no'?>\n")
    elif standalone == 1:
        tree.xmlOutputBufferWrite(c_buffer, 21, "' standalone='yes'?>\n")
    else:
        tree.xmlOutputBufferWrite(c_buffer, 4, "'?>\n")

cdef void _writeDtdToBuffer(tree.xmlOutputBuffer* c_buffer,
                            xmlDoc* c_doc, const_xmlChar* c_root_name,
                            int c_method, const_char* encoding) nogil:
    """Write the DOCTYPE declaration of the document's internal subset.

    Nothing is written if the document has no named internal subset or if
    the DTD name does not match ``c_root_name``.  Empty public/system IDs
    are treated as absent.  If the DTD has content, it is written inside
    a ``[ ... ]`` section.
    """
    cdef tree.xmlDtd* c_dtd
    cdef xmlNode* c_node
    cdef char* quotechar
    c_dtd = c_doc.intSubset
    if not c_dtd or not c_dtd.name:
        return

    # Name in document type declaration must match the root element tag.
    # For XML, case sensitive match, for HTML insensitive.
    if c_method == OUTPUT_METHOD_HTML:
        if tree.xmlStrcasecmp(c_root_name, c_dtd.name) != 0:
            return
    else:
        if tree.xmlStrcmp(c_root_name, c_dtd.name) != 0:
            return

    tree.xmlOutputBufferWrite(c_buffer, 10, "<!DOCTYPE ")
    tree.xmlOutputBufferWriteString(c_buffer, <const_char*>c_dtd.name)

    cdef const_xmlChar* public_id = c_dtd.ExternalID
    cdef const_xmlChar* sys_url = c_dtd.SystemID
    if public_id and public_id[0] == b'\0':
        public_id = NULL
    if sys_url and sys_url[0] == b'\0':
        sys_url = NULL

    if public_id:
        tree.xmlOutputBufferWrite(c_buffer, 9, ' PUBLIC "')
        tree.xmlOutputBufferWriteString(c_buffer, <const_char*>public_id)
        if sys_url:
            tree.xmlOutputBufferWrite(c_buffer, 2, '" ')
        else:
            tree.xmlOutputBufferWrite(c_buffer, 1, '"')
    elif sys_url:
        tree.xmlOutputBufferWrite(c_buffer, 8, ' SYSTEM ')

    if sys_url:
        # Use single quotes if the system URL itself contains a double quote.
        if tree.xmlStrchr(sys_url, b'"'):
            quotechar = '\''
        else:
            quotechar = '"'
        tree.xmlOutputBufferWrite(c_buffer, 1, quotechar)
        tree.xmlOutputBufferWriteString(c_buffer, <const_char*>sys_url)
        tree.xmlOutputBufferWrite(c_buffer, 1, quotechar)

    if (not c_dtd.entities and not c_dtd.elements and
            not c_dtd.attributes and not c_dtd.notations and
            not c_dtd.pentities):
        tree.xmlOutputBufferWrite(c_buffer, 2, '>\n')
        return

    tree.xmlOutputBufferWrite(c_buffer, 3, ' [\n')
    if c_dtd.notations and not c_buffer.error:
        c_buf = tree.xmlBufferCreate()
        if not c_buf:
            c_buffer.error = xmlerror.XML_ERR_NO_MEMORY
            return
        tree.xmlDumpNotationTable(c_buf, <tree.xmlNotationTable*>c_dtd.notations)
        tree.xmlOutputBufferWrite(
            c_buffer, tree.xmlBufferLength(c_buf),
            <const_char*>tree.xmlBufferContent(c_buf))
        tree.xmlBufferFree(c_buf)
    c_node = c_dtd.children
    while c_node and not c_buffer.error:
        tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, 0, encoding)
        c_node = c_node.next
    tree.xmlOutputBufferWrite(c_buffer, 3, "]>\n")

cdef void _writeTail(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
                     const_char* encoding, int c_method, bint pretty_print) nogil:
    u"Write the element tail."
    # The tail is the run of text/CDATA nodes directly following the node.
    c_node = c_node.next
    while c_node and not c_buffer.error and c_node.type in (
            tree.XML_TEXT_NODE, tree.XML_CDATA_SECTION_NODE):
        if c_method == OUTPUT_METHOD_HTML:
            tree.htmlNodeDumpFormatOutput(
                c_buffer, c_node.doc, c_node, encoding, pretty_print)
        else:
            tree.xmlNodeDumpOutput(
                c_buffer, c_node.doc, c_node, 0, pretty_print, encoding)
        c_node = c_node.next

cdef void _writePrevSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
                             const_char* encoding, bint pretty_print) nogil:
    """Write the PIs and comments that directly precede ``c_node``.

    Does nothing if the parent of ``c_node`` is an element.
    """
    cdef xmlNode* c_sibling
    if c_node.parent and _isElement(c_node.parent):
        return
    # we are at a root node, so add PI and comment siblings
    # First walk back to the earliest PI/comment, then write forwards.
    c_sibling = c_node
    while c_sibling.prev and \
            (c_sibling.prev.type == tree.XML_PI_NODE or
             c_sibling.prev.type == tree.XML_COMMENT_NODE):
        c_sibling = c_sibling.prev
    while c_sibling is not c_node and not c_buffer.error:
        tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0,
                               pretty_print, encoding)
        if pretty_print:
            tree.xmlOutputBufferWriteString(c_buffer, "\n")
        c_sibling = c_sibling.next

cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
                             const_char* encoding, bint pretty_print) nogil:
    """Write the PIs and comments that directly follow ``c_node``.

    Does nothing if the parent of ``c_node`` is an element.
    """
    cdef xmlNode* c_sibling
    if c_node.parent and _isElement(c_node.parent):
        return
    # we are at a root node, so add PI and comment siblings
    c_sibling = c_node.next
    while not c_buffer.error and c_sibling and \
            (c_sibling.type == tree.XML_PI_NODE or
             c_sibling.type == tree.XML_COMMENT_NODE):
        if pretty_print:
            tree.xmlOutputBufferWriteString(c_buffer, "\n")
        tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0,
                               pretty_print, encoding)
        c_sibling = c_sibling.next


# copied and adapted from libxml2
cdef unsigned char *xmlSerializeHexCharRef(unsigned char *out, int val):
    """Write ``val`` as a hexadecimal character reference (``&#x...;``).

    The reference is written to ``out`` followed by a terminating 0 byte.
    Returns the pointer to that terminating byte.
    """
    cdef xmlChar *ptr
    cdef xmlChar c

    out[0] = '&'
    out += 1

    out[0] = '#'
    out += 1

    out[0] = 'x'
    out += 1

    # Position 'ptr' on the last hex digit; digits are written right to left.
    if val < 0x10:
        ptr = out
    elif val < 0x100:
        ptr = out + 1
    elif val < 0x1000:
        ptr = out + 2
    elif val < 0x10000:
        ptr = out + 3
    elif val < 0x100000:
        ptr = out + 4
    else:
        ptr = out + 5

    out = ptr + 1
    while val > 0:
        c = (val & 0xF)

        if c == 0:
            ptr[0] = '0'
        elif c == 1:
            ptr[0] = '1'
        elif c == 2:
            ptr[0] = '2'
        elif c == 3:
            ptr[0] = '3'
        elif c == 4:
            ptr[0] = '4'
        elif c == 5:
            ptr[0] = '5'
        elif c == 6:
            ptr[0] = '6'
        elif c == 7:
            ptr[0] = '7'
        elif c == 8:
            ptr[0] = '8'
        elif c == 9:
            ptr[0] = '9'
        elif c == 0xA:
            ptr[0] = 'A'
        elif c == 0xB:
            ptr[0] = 'B'
        elif c == 0xC:
            ptr[0] = 'C'
        elif c == 0xD:
            ptr[0] = 'D'
        elif c == 0xE:
            ptr[0] = 'E'
        elif c == 0xF:
            ptr[0] = 'F'
        else:
            ptr[0] = '0'

        ptr -= 1

        val >>= 4

    out[0] = ';'
    out += 1
    out[0] = 0

    return out


# copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent())
cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string):
    """Write ``string`` to ``buf``, escaped for use as an attribute value.

    Newline, carriage return and tab are written as numeric character
    references, the characters ``"``, ``<``, ``>`` and ``&`` as named
    entities, and multi-byte UTF-8 sequences as hexadecimal character
    references.  A NULL ``string`` writes nothing.

    Raises ``ValueError`` for an invalid UTF-8 sequence or a character
    rejected by ``xmlIsCharQ()``.
    """
    cdef const char *base
    cdef const char *cur
    cdef const unsigned char *ucur

    cdef unsigned char tmp[12]
    cdef int val = 0
    cdef int l

    if string == NULL:
        return

    # 'base' marks the start of the pending run of unescaped bytes.
    base = cur = <const char*>string
    while cur[0] != 0:
        if cur[0] == '\n':
            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            tree.xmlOutputBufferWrite(buf, 5, "&#10;")
            cur += 1
            base = cur

        elif cur[0] == '\r':
            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            tree.xmlOutputBufferWrite(buf, 5, "&#13;")
            cur += 1
            base = cur

        elif cur[0] == '\t':
            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            tree.xmlOutputBufferWrite(buf, 4, "&#9;")
            cur += 1
            base = cur

        elif cur[0] == '"':
            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            tree.xmlOutputBufferWrite(buf, 6, "&quot;")
            cur += 1
            base = cur

        elif cur[0] == '<':
            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            tree.xmlOutputBufferWrite(buf, 4, "&lt;")
            cur += 1
            base = cur

        elif cur[0] == '>':
            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            tree.xmlOutputBufferWrite(buf, 4, "&gt;")
            cur += 1
            base = cur
        elif cur[0] == '&':
            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            tree.xmlOutputBufferWrite(buf, 5, "&amp;")
            cur += 1
            base = cur

        elif (<const unsigned char>cur[0] >= 0x80) and (cur[1] != 0):

            if base != cur:
                tree.xmlOutputBufferWrite(buf, cur - base, base)

            ucur = <const unsigned char *>cur

            # Decode one UTF-8 sequence into 'val'; 'l' is its byte length.
            if ucur[0] < 0xC0:
                # invalid UTF-8 sequence
                val = ucur[0]
                l = 1

            elif ucur[0] < 0xE0:
                val = (ucur[0]) & 0x1F
                val <<= 6
                val |= (ucur[1]) & 0x3F
                l = 2

            elif (ucur[0] < 0xF0) and (ucur[2] != 0):
                val = (ucur[0]) & 0x0F
                val <<= 6
                val |= (ucur[1]) & 0x3F
                val <<= 6
                val |= (ucur[2]) & 0x3F
                l = 3

            elif (ucur[0] < 0xF8) and (ucur[2] != 0) and (ucur[3] != 0):
                val = (ucur[0]) & 0x07
                val <<= 6
                val |= (ucur[1]) & 0x3F
                val <<= 6
                val |= (ucur[2]) & 0x3F
                val <<= 6
                val |= (ucur[3]) & 0x3F
                l = 4
            else:
                # invalid UTF-8 sequence
                val = ucur[0]
                l = 1

            if (l == 1) or (not tree.xmlIsCharQ(val)):
                raise ValueError(f"Invalid character: {val:X}")

            # We could do multiple things here. Just save
            # as a char ref
            xmlSerializeHexCharRef(tmp, val)
            tree.xmlOutputBufferWrite(buf, len(tmp), <const char*> tmp)
            cur += l
            base = cur

        else:
            cur += 1

    # Flush whatever unescaped run is left at the end of the string.
    if base != cur:
        tree.xmlOutputBufferWrite(buf, cur - base, base)


############################################################
# output to file-like objects

cdef object io_open
from io import open
# 'from io import open' binds the name 'open', not 'io_open', so bind the
# declared C-level name explicitly; it is called by _open_utf8_file().
io_open = open

cdef object gzip
import gzip

cdef object getwriter
from codecs import getwriter
cdef object utf8_writer = getwriter('utf8')

cdef object contextmanager
from contextlib import contextmanager

cdef object _open_utf8_file

@contextmanager
def _open_utf8_file(file, compression=0):
    """Context manager that yields a UTF-8 text writer for ``file``.

    ``file`` may be a path or an object that gets wrapped in a UTF-8 stream
    writer.  With a non-zero ``compression`` level, the output goes through
    a ``GzipFile``.  Files and gzip wrappers opened here are closed on exit;
    a file object that was passed in is not closed.
    """
    file = _getFSPathOrObject(file)
    if _isString(file):
        if compression:
            with gzip.GzipFile(file, mode='wb', compresslevel=compression) as zf:
                yield utf8_writer(zf)
        else:
            with io_open(file, 'w', encoding='utf8') as f:
                yield f
    else:
        if compression:
            with gzip.GzipFile(fileobj=file, mode='wb', compresslevel=compression) as zf:
                yield utf8_writer(zf)
        else:
            yield utf8_writer(file)


@cython.final
@cython.internal
cdef class _FilelikeWriter:
    """Adapter that lets a libxml2 output buffer write to a Python
    file-like object.

    Exceptions raised by the file-like object are stored in the exception
    context instead of propagating through the C callbacks.
    """
    cdef object _filelike
    cdef object _close_filelike
    cdef _ExceptionContext _exc_context
    cdef _ErrorLog error_log
    def __cinit__(self, filelike, exc_context=None, compression=None, close=False):
        if compression is not None and compression > 0:
            # The gzip wrapper is created here, so it is always closed here.
            filelike = GzipFile(
                fileobj=filelike, mode='wb', compresslevel=compression)
            self._close_filelike = filelike.close
        elif close:
            self._close_filelike = filelike.close
        self._filelike = filelike
        if exc_context is None:
            self._exc_context = _ExceptionContext()
        else:
            self._exc_context = exc_context
        self.error_log = _ErrorLog()

    cdef tree.xmlOutputBuffer* _createOutputBuffer(
            self, tree.xmlCharEncodingHandler* enchandler) except NULL:
        """Create a libxml2 output buffer that writes through this object.

        Raises ``IOError`` if libxml2 cannot create the buffer.
        """
        cdef tree.xmlOutputBuffer* c_buffer
        c_buffer = tree.xmlOutputBufferCreateIO(
            <tree.xmlOutputWriteCallback>_writeFilelikeWriter, _closeFilelikeWriter,
            <python.PyObject*>self, enchandler)
        if c_buffer is NULL:
            raise IOError, u"Could not create I/O writer context."
        return c_buffer

    cdef int write(self, char* c_buffer, int size):
        """Write ``size`` bytes to the file-like object.

        Returns ``size`` on success and -1 on failure, in which case the
        exception is stored in the exception context.
        """
        try:
            if self._filelike is None:
                raise IOError, u"File is already closed"
            py_buffer = <bytes>c_buffer[:size]
            self._filelike.write(py_buffer)
        except:
            size = -1
            self._exc_context._store_raised()
        finally:
            return size  # and swallow any further exceptions

    cdef int close(self):
        """Release the file-like object, closing it only if requested.

        Returns 0 on success and -1 on failure, in which case the exception
        is stored in the exception context.
        """
        retval = 0
        try:
            if self._close_filelike is not None:
                self._close_filelike()
            # we should not close the file here as we didn't open it
            self._filelike = None
        except:
            retval = -1
            self._exc_context._store_raised()
        finally:
            return retval  # and swallow any further exceptions

cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int length):
    """libxml2 write callback: forward to ``_FilelikeWriter.write()``."""
    return (<_FilelikeWriter>ctxt).write(c_buffer, length)

cdef int _closeFilelikeWriter(void* ctxt):
    """libxml2 close callback: forward to ``_FilelikeWriter.close()``."""
    return (<_FilelikeWriter>ctxt).close()

cdef _tofilelike(f, _Element element, encoding, doctype, method,
                 bint write_xml_declaration, bint write_doctype,
                 bint pretty_print, bint with_tail, int standalone,
                 int compression):
    """Serialise ``element`` to ``f``, a file path or file-like object.

    The "text" method writes the result of ``_textToString()``, gzip
    compressed if ``compression`` is non-zero.  The other methods serialise
    through a libxml2 output buffer created by ``_create_output_buffer()``.

    Exceptions stored by the writer are re-raised, and libxml2 errors are
    raised through ``_raiseSerialisationError()``.
    """
    cdef _FilelikeWriter writer = None
    cdef tree.xmlOutputBuffer* c_buffer
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* c_enc
    cdef const_xmlChar* c_doctype
    cdef int error_result

    c_method = _findOutputMethod(method)
    if c_method == OUTPUT_METHOD_TEXT:
        data = _textToString(element._c_node, encoding, with_tail)
        if compression:
            bytes_out = BytesIO()
            with GzipFile(fileobj=bytes_out, mode='wb', compresslevel=compression) as gzip_file:
                gzip_file.write(data)
            data = bytes_out.getvalue()
        f = _getFSPathOrObject(f)
        if _isString(f):
            filename8 = _encodeFilename(f)
            with open(filename8, 'wb') as f:
                f.write(data)
        else:
            f.write(data)
        return

    if encoding is None:
        c_enc = NULL
    else:
        encoding = _utf8(encoding)
        c_enc = _cstr(encoding)
    if doctype is None:
        c_doctype = NULL
    else:
        doctype = _utf8(doctype)
        c_doctype = _xcstr(doctype)

    writer = _create_output_buffer(f, c_enc, compression, &c_buffer, close=False)
    if writer is None:
        # No Python writer object is involved, so the GIL can be released.
        with nogil:
            error_result = _serialise_node(
                c_buffer, c_doctype, c_enc, element._c_node, c_method,
                write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)
    else:
        error_result = _serialise_node(
            c_buffer, c_doctype, c_enc, element._c_node, c_method,
            write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)

    if writer is not None:
        writer._exc_context._raise_if_stored()
    if error_result != xmlerror.XML_ERR_OK:
        _raiseSerialisationError(error_result)


cdef int _serialise_node(tree.xmlOutputBuffer* c_buffer, const_xmlChar* c_doctype,
                         const_char* c_enc, xmlNode* c_node, int c_method,
                         bint write_xml_declaration, bint write_doctype, bint pretty_print,
                         bint with_tail, int standalone) nogil:
    """Write the node to ``c_buffer`` and close the buffer.

    Returns ``XML_ERR_OK`` on success, the buffer's error code if writing
    failed, or -1 if closing the buffer failed.  The buffer is closed in
    all cases.
    """
    _writeNodeToBuffer(
        c_buffer, c_node, c_enc, c_doctype, c_method,
        write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)
    error_result = c_buffer.error
    if error_result == xmlerror.XML_ERR_OK:
        error_result = tree.xmlOutputBufferClose(c_buffer)
        if error_result != -1:
            error_result = xmlerror.XML_ERR_OK
    else:
        tree.xmlOutputBufferClose(c_buffer)
    return error_result


cdef _FilelikeWriter _create_output_buffer(
        f, const_char* c_enc, int c_compression,
        tree.xmlOutputBuffer** c_buffer_ret, bint close):
    """Create a libxml2 output buffer for a file path or file-like object.

    The buffer is stored in ``c_buffer_ret[0]``.  Returns the
    ``_FilelikeWriter`` for file-like objects and ``None`` for file paths.

    Raises ``LookupError`` for an unknown encoding, ``IOError`` if the file
    cannot be opened, and ``TypeError`` if ``f`` is neither a path nor an
    object with a ``write`` attribute.
    """
    cdef tree.xmlOutputBuffer* c_buffer
    cdef _FilelikeWriter writer
    cdef bytes filename8
    enchandler = tree.xmlFindCharEncodingHandler(c_enc)
    if enchandler is NULL:
        raise LookupError(
            f"unknown encoding: '{c_enc.decode('UTF-8') if c_enc is not NULL else u''}'")
    try:
        f = _getFSPathOrObject(f)
        if _isString(f):
            filename8 = _encodeFilename(f)
            if b'%' in filename8 and (
                    # Exclude absolute Windows paths and file:// URLs.
                    _isFilePath(<const xmlChar*>filename8) not in (NO_FILE_PATH, ABS_WIN_FILE_PATH)
                    or filename8[:7].lower() == b'file://'):
                # A file path (not a URL) containing the '%' URL escape character.
                # libxml2 uses URL-unescaping on these, so escape the path before passing it in.
                filename8 = filename8.replace(b'%', b'%25')
            c_buffer = tree.xmlOutputBufferCreateFilename(
                _cstr(filename8), enchandler, c_compression)
            if c_buffer is NULL:
                python.PyErr_SetFromErrno(IOError)  # raises IOError
            writer = None
        elif hasattr(f, 'write'):
            writer = _FilelikeWriter(f, compression=c_compression, close=close)
            c_buffer = writer._createOutputBuffer(enchandler)
        else:
            raise TypeError(
                f"File or filename expected, got '{python._fqtypename(f).decode('UTF-8')}'")
    except:
        # No buffer took ownership of the handler, so release it here.
        tree.xmlCharEncCloseFunc(enchandler)
        raise
    c_buffer_ret[0] = c_buffer
    return writer

cdef xmlChar **_convert_ns_prefixes(tree.xmlDict* c_dict, ns_prefixes) except NULL:
    """Build a NULL-terminated C array of the prefixes found in ``c_dict``.

    Prefixes that are not in the dictionary are left out.  The array is
    allocated with ``lxml_malloc()`` and must be released by the caller
    with ``lxml_free()``.  Raises ``MemoryError`` if allocation fails.
    """
    cdef size_t i, num_ns_prefixes = len(ns_prefixes)
    # Need to allocate one extra memory block to handle last NULL entry
    c_ns_prefixes = <xmlChar **>python.lxml_malloc(num_ns_prefixes + 1, sizeof(xmlChar*))
    if not c_ns_prefixes:
        raise MemoryError()
    i = 0
    try:
        for prefix in ns_prefixes:
            prefix_utf = _utf8(prefix)
            c_prefix = tree.xmlDictExists(c_dict, _xcstr(prefix_utf), len(prefix_utf))
            if c_prefix:
                # unknown prefixes do not need to get serialised
                c_ns_prefixes[i] = <xmlChar*>c_prefix
                i += 1
    except:
        python.lxml_free(c_ns_prefixes)
        raise

    c_ns_prefixes[i] = NULL  # append end marker
    return c_ns_prefixes

cdef _tofilelikeC14N(f, _Element element, bint exclusive, bint with_comments,
                     int compression, inclusive_ns_prefixes):
    """Write the C14N serialisation of ``element`` to a path or file-like
    object.

    Raises ``TypeError`` if ``f`` is neither a path nor an object with a
    ``write`` attribute, and ``C14NError`` if libxml2 reports a failure.
    Exceptions stored by the writer are re-raised.
    """
    cdef _FilelikeWriter writer = None
    cdef tree.xmlOutputBuffer* c_buffer
    cdef xmlChar **c_inclusive_ns_prefixes = NULL
    cdef char* c_filename
    cdef xmlDoc* c_base_doc
    cdef xmlDoc* c_doc
    cdef int bytes_count, error = 0

    c_base_doc = element._c_node.doc
    c_doc = _fakeRootDoc(c_base_doc, element._c_node)
    try:
        c_inclusive_ns_prefixes = (
            _convert_ns_prefixes(c_doc.dict, inclusive_ns_prefixes)
            if inclusive_ns_prefixes else NULL)

        f = _getFSPathOrObject(f)
        if _isString(f):
            filename8 = _encodeFilename(f)
            c_filename = _cstr(filename8)
            with nogil:
                error = c14n.xmlC14NDocSave(
                    c_doc, NULL, exclusive, c_inclusive_ns_prefixes,
                    with_comments, c_filename, compression)
        elif hasattr(f, 'write'):
            writer = _FilelikeWriter(f, compression=compression)
            c_buffer = writer._createOutputBuffer(NULL)
            try:
                with writer.error_log:
                    bytes_count = c14n.xmlC14NDocSaveTo(
                        c_doc, NULL, exclusive, c_inclusive_ns_prefixes,
                        with_comments, c_buffer)
            finally:
                error = tree.xmlOutputBufferClose(c_buffer)
            if bytes_count < 0:
                error = bytes_count
            elif error != -1:
                error = xmlerror.XML_ERR_OK
        else:
            raise TypeError(f"File or filename expected, got '{python._fqtypename(f).decode('UTF-8')}'")
    finally:
        _destroyFakeDoc(c_base_doc, c_doc)
        if c_inclusive_ns_prefixes is not NULL:
            python.lxml_free(c_inclusive_ns_prefixes)

    if writer is not None:
        writer._exc_context._raise_if_stored()

    if error < 0:
        message = u"C14N failed"
        if writer is not None:
            errors = writer.error_log
            if len(errors):
                message = errors[0].message
        raise C14NError(message)


# C14N 2.0

def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string.

    Either *xml_data* (an XML string, tree or Element) or *from_file*
    (a file path or file-like object) must be provided as input.
    A ``ValueError`` is raised if both are missing.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    sio = None
    if out is None:
        sio = out = StringIO()

    target = C14NWriterTarget(out.write, **options)

    # Trees and elements are walked directly instead of being re-parsed.
    if xml_data is not None and not isinstance(xml_data, basestring):
        _tree_to_target(xml_data, target)
        return sio.getvalue() if sio is not None else None

    cdef _FeedParser parser = XMLParser(
        target=target,
        attribute_defaults=True,
        collect_ids=False,
    )

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    elif from_file is not None:
        try:
            _parseDocument(from_file, parser, base_url=None)
        except _TargetParserResult:
            pass

    return sio.getvalue() if sio is not None else None


cdef _tree_to_target(element, target):
    """Replay the tree below ``element`` as parser target events.

    Walks the tree with ``iterwalk()`` and calls the ``start``, ``end``,
    ``start_ns``, ``comment``, ``pi`` and ``data`` methods of ``target``.
    Returns the result of ``target.close()``.
    """
    for event, elem in iterwalk(element, events=('start', 'end', 'start-ns', 'comment', 'pi')):
        text = None
        if event == 'start':
            target.start(elem.tag, elem.attrib)
            text = elem.text
        elif event == 'end':
            target.end(elem.tag)
            text = elem.tail
        elif event == 'start-ns':
            target.start_ns(*elem)
            continue
        elif event == 'comment':
            target.comment(elem.text)
            text = elem.tail
        elif event == 'pi':
            target.pi(elem.target, elem.text)
            text = elem.tail
        if text:
            target.data(text)
    return target.close()


# Raw string: '\w' is a regex escape, not a Python string escape.
cdef object _looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match


cdef class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    cdef object _write
    cdef list _data
    cdef set _qname_aware_tags
    cdef object _find_qname_aware_attrs
    cdef list _declared_ns_stack
    cdef list _ns_stack
    cdef dict _prefix_map
    cdef list _preserve_space
    cdef tuple _pending_start
    cdef set _exclude_tags
    cdef set _exclude_attrs
    cdef Py_ssize_t _ignored_depth
    cdef bint _with_comments
    cdef bint _strip_text
    cdef bint _rewrite_prefixes
    cdef bint _root_seen
    cdef bint _root_done

    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
self._qname_aware_tags = set(qname_aware_tags) 1021 else: 1022 self._qname_aware_tags = None 1023 if qname_aware_attrs: 1024 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection 1025 else: 1026 self._find_qname_aware_attrs = None 1027 1028 # Stack with globally and newly declared namespaces as (uri, prefix) pairs. 1029 self._declared_ns_stack = [[ 1030 ("http://www.w3.org/XML/1998/namespace", "xml"), 1031 ]] 1032 # Stack with user declared namespace prefixes as (uri, prefix) pairs. 1033 self._ns_stack = [] 1034 if not rewrite_prefixes: 1035 self._ns_stack.append(_DEFAULT_NAMESPACE_PREFIXES_ITEMS) 1036 self._ns_stack.append([]) 1037 self._prefix_map = {} 1038 self._preserve_space = [False] 1039 self._pending_start = None 1040 self._ignored_depth = 0 1041 self._root_seen = False 1042 self._root_done = False 1043 1044 def _iter_namespaces(self, ns_stack): 1045 for namespaces in reversed(ns_stack): 1046 if namespaces: # almost no element declares new namespaces 1047 yield from namespaces 1048 1049 cdef _resolve_prefix_name(self, prefixed_name): 1050 prefix, name = prefixed_name.split(':', 1) 1051 for uri, p in self._iter_namespaces(self._ns_stack): 1052 if p == prefix: 1053 return f'{{{uri}}}{name}' 1054 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope') 1055 1056 cdef _qname(self, qname, uri=None): 1057 if uri is None: 1058 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname) 1059 else: 1060 tag = qname 1061 1062 prefixes_seen = set() 1063 for u, prefix in self._iter_namespaces(self._declared_ns_stack): 1064 if u == uri and prefix not in prefixes_seen: 1065 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1066 prefixes_seen.add(prefix) 1067 1068 # Not declared yet => add new declaration. 
1069 if self._rewrite_prefixes: 1070 if uri in self._prefix_map: 1071 prefix = self._prefix_map[uri] 1072 else: 1073 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}' 1074 self._declared_ns_stack[-1].append((uri, prefix)) 1075 return f'{prefix}:{tag}', tag, uri 1076 1077 if not uri and '' not in prefixes_seen: 1078 # No default namespace declared => no prefix needed. 1079 return tag, tag, uri 1080 1081 for u, prefix in self._iter_namespaces(self._ns_stack): 1082 if u == uri: 1083 self._declared_ns_stack[-1].append((uri, prefix)) 1084 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1085 1086 if not uri: 1087 # As soon as a default namespace is defined, 1088 # anything that has no namespace (and thus, no prefix) goes there. 1089 return tag, tag, uri 1090 1091 raise ValueError(f'Namespace "{uri}" of name "{tag}" is not declared in scope') 1092 1093 def data(self, data): 1094 if not self._ignored_depth: 1095 self._data.append(data) 1096 1097 cdef _flush(self): 1098 data = u''.join(self._data) 1099 del self._data[:] 1100 if self._strip_text and not self._preserve_space[-1]: 1101 data = data.strip() 1102 if self._pending_start is not None: 1103 (tag, attrs, new_namespaces), self._pending_start = self._pending_start, None 1104 qname_text = data if u':' in data and _looks_like_prefix_name(data) else None 1105 self._start(tag, attrs, new_namespaces, qname_text) 1106 if qname_text is not None: 1107 return 1108 if data and self._root_seen: 1109 self._write(_escape_cdata_c14n(data)) 1110 1111 def start_ns(self, prefix, uri): 1112 if self._ignored_depth: 1113 return 1114 # we may have to resolve qnames in text content 1115 if self._data: 1116 self._flush() 1117 self._ns_stack[-1].append((uri, prefix)) 1118 1119 def start(self, tag, attrs): 1120 if self._exclude_tags is not None and ( 1121 self._ignored_depth or tag in self._exclude_tags): 1122 self._ignored_depth += 1 1123 return 1124 if self._data: 1125 self._flush() 1126 1127 new_namespaces = [] 1128 
self._declared_ns_stack.append(new_namespaces) 1129 1130 if self._qname_aware_tags is not None and tag in self._qname_aware_tags: 1131 # Need to parse text first to see if it requires a prefix declaration. 1132 self._pending_start = (tag, attrs, new_namespaces) 1133 return 1134 self._start(tag, attrs, new_namespaces) 1135 1136 cdef _start(self, tag, attrs, new_namespaces, qname_text=None): 1137 if self._exclude_attrs is not None and attrs: 1138 attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs} 1139 1140 qnames = {tag, *attrs} 1141 resolved_names = {} 1142 1143 # Resolve prefixes in attribute and tag text. 1144 if qname_text is not None: 1145 qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text) 1146 qnames.add(qname) 1147 if self._find_qname_aware_attrs is not None and attrs: 1148 qattrs = self._find_qname_aware_attrs(attrs) 1149 if qattrs: 1150 for attr_name in qattrs: 1151 value = attrs[attr_name] 1152 if _looks_like_prefix_name(value): 1153 qname = resolved_names[value] = self._resolve_prefix_name(value) 1154 qnames.add(qname) 1155 else: 1156 qattrs = None 1157 else: 1158 qattrs = None 1159 1160 # Assign prefixes in lexicographical order of used URIs. 1161 parsed_qnames = {n: self._qname(n) for n in sorted( 1162 qnames, key=lambda n: n.split('}', 1))} 1163 1164 # Write namespace declarations in prefix order ... 1165 if new_namespaces: 1166 attr_list = [ 1167 (u'xmlns:' + prefix if prefix else u'xmlns', uri) 1168 for uri, prefix in new_namespaces 1169 ] 1170 attr_list.sort() 1171 else: 1172 # almost always empty 1173 attr_list = [] 1174 1175 # ... followed by attributes in URI+name order 1176 if attrs: 1177 for k, v in sorted(attrs.items()): 1178 if qattrs is not None and k in qattrs and v in resolved_names: 1179 v = parsed_qnames[resolved_names[v]][0] 1180 attr_qname, attr_name, uri = parsed_qnames[k] 1181 # No prefix for attributes in default ('') namespace. 
1182 attr_list.append((attr_qname if uri else attr_name, v)) 1183 1184 # Honour xml:space attributes. 1185 space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space') 1186 self._preserve_space.append( 1187 space_behaviour == 'preserve' if space_behaviour 1188 else self._preserve_space[-1]) 1189 1190 # Write the tag. 1191 write = self._write 1192 write(u'<' + parsed_qnames[tag][0]) 1193 if attr_list: 1194 write(u''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list])) 1195 write(u'>') 1196 1197 # Write the resolved qname text content. 1198 if qname_text is not None: 1199 write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0])) 1200 1201 self._root_seen = True 1202 self._ns_stack.append([]) 1203 1204 def end(self, tag): 1205 if self._ignored_depth: 1206 self._ignored_depth -= 1 1207 return 1208 if self._data: 1209 self._flush() 1210 self._write(f'</{self._qname(tag)[0]}>') 1211 self._preserve_space.pop() 1212 self._root_done = len(self._preserve_space) == 1 1213 self._declared_ns_stack.pop() 1214 self._ns_stack.pop() 1215 1216 def comment(self, text): 1217 if not self._with_comments: 1218 return 1219 if self._ignored_depth: 1220 return 1221 if self._root_done: 1222 self._write(u'\n') 1223 elif self._root_seen and self._data: 1224 self._flush() 1225 self._write(f'<!--{_escape_cdata_c14n(text)}-->') 1226 if not self._root_seen: 1227 self._write(u'\n') 1228 1229 def pi(self, target, data): 1230 if self._ignored_depth: 1231 return 1232 if self._root_done: 1233 self._write(u'\n') 1234 elif self._root_seen and self._data: 1235 self._flush() 1236 self._write( 1237 f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>') 1238 if not self._root_seen: 1239 self._write(u'\n') 1240 1241 def close(self): 1242 return None 1243 1244 1245 cdef _raise_serialization_error(text): 1246 raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__)) 1247 1248 1249 cdef unicode _escape_cdata_c14n(stext): 1250 # 
escape character data 1251 cdef unicode text 1252 try: 1253 # it's worth avoiding do-nothing calls for strings that are 1254 # shorter than 500 character, or so. assume that's, by far, 1255 # the most common case in most applications. 1256 text = unicode(stext) 1257 if u'&' in text: 1258 text = text.replace(u'&', u'&') 1259 if u'<' in text: 1260 text = text.replace(u'<', u'<') 1261 if u'>' in text: 1262 text = text.replace(u'>', u'>') 1263 if u'\r' in text: 1264 text = text.replace(u'\r', u'
') 1265 return text 1266 except (TypeError, AttributeError): 1267 _raise_serialization_error(stext) 1268 1269 1270 cdef unicode _escape_attrib_c14n(stext): 1271 # escape attribute value 1272 cdef unicode text 1273 try: 1274 text = unicode(stext) 1275 if u'&' in text: 1276 text = text.replace(u'&', u'&') 1277 if u'<' in text: 1278 text = text.replace(u'<', u'<') 1279 if u'"' in text: 1280 text = text.replace(u'"', u'"') 1281 if u'\t' in text: 1282 text = text.replace(u'\t', u'	') 1283 if u'\n' in text: 1284 text = text.replace(u'\n', u'
') 1285 if u'\r' in text: 1286 text = text.replace(u'\r', u'
') 1287 return text 1288 except (TypeError, AttributeError): 1289 _raise_serialization_error(stext) 1290 1291 1292 # incremental serialisation 1293 1294 cdef class xmlfile: 1295 """xmlfile(self, output_file, encoding=None, compression=None, close=False, buffered=True) 1296 1297 A simple mechanism for incremental XML serialisation. 1298 1299 Usage example:: 1300 1301 with xmlfile("somefile.xml", encoding='utf-8') as xf: 1302 xf.write_declaration(standalone=True) 1303 xf.write_doctype('<!DOCTYPE root SYSTEM "some.dtd">') 1304 1305 # generate an element (the root element) 1306 with xf.element('root'): 1307 # write a complete Element into the open root element 1308 xf.write(etree.Element('test')) 1309 1310 # generate and write more Elements, e.g. through iterparse 1311 for element in generate_some_elements(): 1312 # serialise generated elements into the XML file 1313 xf.write(element) 1314 1315 # or write multiple Elements or strings at once 1316 xf.write(etree.Element('start'), "text", etree.Element('end')) 1317 1318 If 'output_file' is a file(-like) object, passing ``close=True`` will 1319 close it when exiting the context manager. By default, it is left 1320 to the owner to do that. When a file path is used, lxml will take care 1321 of opening and closing the file itself. Also, when a compression level 1322 is set, lxml will deliberately close the file to make sure all data gets 1323 compressed and written. 1324 1325 Setting ``buffered=False`` will flush the output after each operation, 1326 such as opening or closing an ``xf.element()`` block or calling 1327 ``xf.write()``. Alternatively, calling ``xf.flush()`` can be used to 1328 explicitly flush any pending output when buffering is enabled. 
1329 """ 1330 cdef object output_file 1331 cdef bytes encoding 1332 cdef _IncrementalFileWriter writer 1333 cdef _AsyncIncrementalFileWriter async_writer 1334 cdef int compresslevel 1335 cdef bint close 1336 cdef bint buffered 1337 cdef int method 1338 1339 def __init__(self, output_file not None, encoding=None, compression=None, 1340 close=False, buffered=True): 1341 self.output_file = output_file 1342 self.encoding = _utf8orNone(encoding) 1343 self.compresslevel = compression or 0 1344 self.close = close 1345 self.buffered = buffered 1346 self.method = OUTPUT_METHOD_XML 1347 1348 def __enter__(self): 1349 assert self.output_file is not None 1350 self.writer = _IncrementalFileWriter( 1351 self.output_file, self.encoding, self.compresslevel, 1352 self.close, self.buffered, self.method) 1353 return self.writer 1354 1355 def __exit__(self, exc_type, exc_val, exc_tb): 1356 if self.writer is not None: 1357 old_writer, self.writer = self.writer, None 1358 raise_on_error = exc_type is None 1359 old_writer._close(raise_on_error) 1360 if self.close: 1361 self.output_file = None 1362 1363 async def __aenter__(self): 1364 assert self.output_file is not None 1365 if isinstance(self.output_file, basestring): 1366 raise TypeError("Cannot asynchronously write to a plain file") 1367 if not hasattr(self.output_file, 'write'): 1368 raise TypeError("Output file needs an async .write() method") 1369 self.async_writer = _AsyncIncrementalFileWriter( 1370 self.output_file, self.encoding, self.compresslevel, 1371 self.close, self.buffered, self.method) 1372 return self.async_writer 1373 1374 async def __aexit__(self, exc_type, exc_val, exc_tb): 1375 if self.async_writer is not None: 1376 old_writer, self.async_writer = self.async_writer, None 1377 raise_on_error = exc_type is None 1378 await old_writer._close(raise_on_error) 1379 if self.close: 1380 self.output_file = None 1381 1382 1383 cdef class htmlfile(xmlfile): 1384 """htmlfile(self, output_file, encoding=None, compression=None, 
close=False, buffered=True) 1385 1386 A simple mechanism for incremental HTML serialisation. Works the same as 1387 xmlfile. 1388 """ 1389 def __init__(self, *args, **kwargs): 1390 super().__init__(*args, **kwargs) 1391 self.method = OUTPUT_METHOD_HTML 1392 1393 1394 cdef enum _IncrementalFileWriterStatus: 1395 WRITER_STARTING = 0 1396 WRITER_DECL_WRITTEN = 1 1397 WRITER_DTD_WRITTEN = 2 1398 WRITER_IN_ELEMENT = 3 1399 WRITER_FINISHED = 4 1400 1401 1402 @cython.final 1403 @cython.internal 1404 cdef class _IncrementalFileWriter: 1405 cdef tree.xmlOutputBuffer* _c_out 1406 cdef bytes _encoding 1407 cdef const_char* _c_encoding 1408 cdef _FilelikeWriter _target 1409 cdef list _element_stack 1410 cdef int _status 1411 cdef int _method 1412 cdef bint _buffered 1413 1414 def __cinit__(self, outfile, bytes encoding, int compresslevel, bint close, 1415 bint buffered, int method): 1416 self._status = WRITER_STARTING 1417 self._element_stack = [] 1418 if encoding is None: 1419 encoding = b'ASCII' 1420 self._encoding = encoding 1421 self._c_encoding = _cstr(encoding) if encoding is not None else NULL 1422 self._buffered = buffered 1423 self._target = _create_output_buffer( 1424 outfile, self._c_encoding, compresslevel, &self._c_out, close) 1425 self._method = method 1426 1427 def __dealloc__(self): 1428 if self._c_out is not NULL: 1429 tree.xmlOutputBufferClose(self._c_out) 1430 1431 def write_declaration(self, version=None, standalone=None, doctype=None): 1432 """write_declaration(self, version=None, standalone=None, doctype=None) 1433 1434 Write an XML declaration and (optionally) a doctype into the file. 
1435 """ 1436 assert self._c_out is not NULL 1437 cdef const_xmlChar* c_version 1438 cdef int c_standalone 1439 if self._method != OUTPUT_METHOD_XML: 1440 raise LxmlSyntaxError("only XML documents have declarations") 1441 if self._status >= WRITER_DECL_WRITTEN: 1442 raise LxmlSyntaxError("XML declaration already written") 1443 version = _utf8orNone(version) 1444 c_version = _xcstr(version) if version is not None else NULL 1445 doctype = _utf8orNone(doctype) 1446 if standalone is None: 1447 c_standalone = -1 1448 else: 1449 c_standalone = 1 if standalone else 0 1450 _writeDeclarationToBuffer(self._c_out, c_version, self._c_encoding, c_standalone) 1451 if doctype is not None: 1452 _writeDoctype(self._c_out, _xcstr(doctype)) 1453 self._status = WRITER_DTD_WRITTEN 1454 else: 1455 self._status = WRITER_DECL_WRITTEN 1456 if not self._buffered: 1457 tree.xmlOutputBufferFlush(self._c_out) 1458 self._handle_error(self._c_out.error) 1459 1460 def write_doctype(self, doctype): 1461 """write_doctype(self, doctype) 1462 1463 Writes the given doctype declaration verbatimly into the file. 1464 """ 1465 assert self._c_out is not NULL 1466 if doctype is None: 1467 return 1468 if self._status >= WRITER_DTD_WRITTEN: 1469 raise LxmlSyntaxError("DOCTYPE already written or cannot write it here") 1470 doctype = _utf8(doctype) 1471 _writeDoctype(self._c_out, _xcstr(doctype)) 1472 self._status = WRITER_DTD_WRITTEN 1473 if not self._buffered: 1474 tree.xmlOutputBufferFlush(self._c_out) 1475 self._handle_error(self._c_out.error) 1476 1477 def method(self, method): 1478 """method(self, method) 1479 1480 Returns a context manager that overrides and restores the output method. 1481 method is one of (None, 'xml', 'html') where None means 'xml'. 
1482 """ 1483 assert self._c_out is not NULL 1484 c_method = self._method if method is None else _findOutputMethod(method) 1485 return _MethodChanger(self, c_method) 1486 1487 def element(self, tag, attrib=None, nsmap=None, method=None, **_extra): 1488 """element(self, tag, attrib=None, nsmap=None, method, **_extra) 1489 1490 Returns a context manager that writes an opening and closing tag. 1491 method is one of (None, 'xml', 'html') where None means 'xml'. 1492 """ 1493 assert self._c_out is not NULL 1494 attributes = [] 1495 if attrib is not None: 1496 for name, value in _iter_attrib(attrib): 1497 if name not in _extra: 1498 ns, name = _getNsTag(name) 1499 attributes.append((ns, name, _utf8(value))) 1500 if _extra: 1501 for name, value in _extra.iteritems(): 1502 ns, name = _getNsTag(name) 1503 attributes.append((ns, name, _utf8(value))) 1504 reversed_nsmap = {} 1505 if nsmap: 1506 for prefix, ns in nsmap.items(): 1507 if prefix is not None: 1508 prefix = _utf8(prefix) 1509 _prefixValidOrRaise(prefix) 1510 reversed_nsmap[_utf8(ns)] = prefix 1511 ns, name = _getNsTag(tag) 1512 1513 c_method = self._method if method is None else _findOutputMethod(method) 1514 1515 return _FileWriterElement(self, (ns, name, attributes, reversed_nsmap), c_method) 1516 1517 cdef _write_qname(self, bytes name, bytes prefix): 1518 if prefix: # empty bytes for no prefix (not None to allow sorting) 1519 tree.xmlOutputBufferWrite(self._c_out, len(prefix), _cstr(prefix)) 1520 tree.xmlOutputBufferWrite(self._c_out, 1, ':') 1521 tree.xmlOutputBufferWrite(self._c_out, len(name), _cstr(name)) 1522 1523 cdef _write_start_element(self, element_config): 1524 if self._status > WRITER_IN_ELEMENT: 1525 raise LxmlSyntaxError("cannot append trailing element to complete XML document") 1526 ns, name, attributes, nsmap = element_config 1527 flat_namespace_map, new_namespaces = self._collect_namespaces(nsmap) 1528 prefix = self._find_prefix(ns, flat_namespace_map, new_namespaces) 1529 
tree.xmlOutputBufferWrite(self._c_out, 1, '<') 1530 self._write_qname(name, prefix) 1531 1532 self._write_attributes_and_namespaces( 1533 attributes, flat_namespace_map, new_namespaces) 1534 1535 tree.xmlOutputBufferWrite(self._c_out, 1, '>') 1536 if not self._buffered: 1537 tree.xmlOutputBufferFlush(self._c_out) 1538 self._handle_error(self._c_out.error) 1539 1540 self._element_stack.append((ns, name, prefix, flat_namespace_map)) 1541 self._status = WRITER_IN_ELEMENT 1542 1543 cdef _write_attributes_and_namespaces(self, list attributes, 1544 dict flat_namespace_map, 1545 list new_namespaces): 1546 if attributes: 1547 # _find_prefix() may append to new_namespaces => build them first 1548 attributes = [ 1549 (self._find_prefix(ns, flat_namespace_map, new_namespaces), name, value) 1550 for ns, name, value in attributes ] 1551 if new_namespaces: 1552 new_namespaces.sort() 1553 self._write_attributes_list(new_namespaces) 1554 if attributes: 1555 self._write_attributes_list(attributes) 1556 1557 cdef _write_attributes_list(self, list attributes): 1558 for prefix, name, value in attributes: 1559 tree.xmlOutputBufferWrite(self._c_out, 1, ' ') 1560 self._write_qname(name, prefix) 1561 tree.xmlOutputBufferWrite(self._c_out, 2, '="') 1562 _write_attr_string(self._c_out, _cstr(value)) 1563 1564 tree.xmlOutputBufferWrite(self._c_out, 1, '"') 1565 1566 cdef _write_end_element(self, element_config): 1567 if self._status != WRITER_IN_ELEMENT: 1568 raise LxmlSyntaxError("not in an element") 1569 if not self._element_stack or self._element_stack[-1][:2] != element_config[:2]: 1570 raise LxmlSyntaxError("inconsistent exit action in context manager") 1571 1572 # If previous write operations failed, the context manager exit might still call us. 1573 # That is ok, but we stop writing closing tags and handling errors in that case. 1574 # For all non-I/O errors, we continue writing closing tags if we can. 
1575 ok_to_write = self._c_out.error == xmlerror.XML_ERR_OK 1576 1577 name, prefix = self._element_stack.pop()[1:3] 1578 if ok_to_write: 1579 tree.xmlOutputBufferWrite(self._c_out, 2, '</') 1580 self._write_qname(name, prefix) 1581 tree.xmlOutputBufferWrite(self._c_out, 1, '>') 1582 1583 if not self._element_stack: 1584 self._status = WRITER_FINISHED 1585 if ok_to_write: 1586 if not self._buffered: 1587 tree.xmlOutputBufferFlush(self._c_out) 1588 self._handle_error(self._c_out.error) 1589 1590 cdef _find_prefix(self, bytes href, dict flat_namespaces_map, list new_namespaces): 1591 if href is None: 1592 return None 1593 if href in flat_namespaces_map: 1594 return flat_namespaces_map[href] 1595 # need to create a new prefix 1596 prefixes = flat_namespaces_map.values() 1597 i = 0 1598 while True: 1599 prefix = _utf8('ns%d' % i) 1600 if prefix not in prefixes: 1601 new_namespaces.append((b'xmlns', prefix, href)) 1602 flat_namespaces_map[href] = prefix 1603 return prefix 1604 i += 1 1605 1606 cdef _collect_namespaces(self, dict nsmap): 1607 new_namespaces = [] 1608 flat_namespaces_map = {} 1609 for ns, prefix in nsmap.iteritems(): 1610 flat_namespaces_map[ns] = prefix 1611 if prefix is None: 1612 # use empty bytes rather than None to allow sorting 1613 new_namespaces.append((b'', b'xmlns', ns)) 1614 else: 1615 new_namespaces.append((b'xmlns', prefix, ns)) 1616 # merge in flat namespace map of parent 1617 if self._element_stack: 1618 for ns, prefix in (<dict>self._element_stack[-1][-1]).iteritems(): 1619 if flat_namespaces_map.get(ns) is None: 1620 # unknown or empty prefix => prefer a 'real' prefix 1621 flat_namespaces_map[ns] = prefix 1622 return flat_namespaces_map, new_namespaces 1623 1624 def write(self, *args, bint with_tail=True, bint pretty_print=False, method=None): 1625 """write(self, *args, with_tail=True, pretty_print=False, method=None) 1626 1627 Write subtrees or strings into the file. 
1628 1629 If method is not None, it should be one of ('html', 'xml', 'text') 1630 to temporarily override the output method. 1631 """ 1632 assert self._c_out is not NULL 1633 c_method = self._method if method is None else _findOutputMethod(method) 1634 1635 for content in args: 1636 if _isString(content): 1637 if self._status != WRITER_IN_ELEMENT: 1638 if self._status > WRITER_IN_ELEMENT or content.strip(): 1639 raise LxmlSyntaxError("not in an element") 1640 bstring = _utf8(content) 1641 if not bstring: 1642 continue 1643 1644 ns, name, _, _ = self._element_stack[-1] 1645 if (c_method == OUTPUT_METHOD_HTML and 1646 ns in (None, b'http://www.w3.org/1999/xhtml') and 1647 name in (b'script', b'style')): 1648 tree.xmlOutputBufferWrite(self._c_out, len(bstring), _cstr(bstring)) 1649 1650 else: 1651 tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(bstring), NULL) 1652 1653 elif iselement(content): 1654 if self._status > WRITER_IN_ELEMENT: 1655 raise LxmlSyntaxError("cannot append trailing element to complete XML document") 1656 _writeNodeToBuffer(self._c_out, (<_Element>content)._c_node, 1657 self._c_encoding, NULL, c_method, 1658 False, False, pretty_print, with_tail, False) 1659 if (<_Element>content)._c_node.type == tree.XML_ELEMENT_NODE: 1660 if not self._element_stack: 1661 self._status = WRITER_FINISHED 1662 1663 elif content is not None: 1664 raise TypeError( 1665 f"got invalid input value of type {type(content)}, expected string or Element") 1666 self._handle_error(self._c_out.error) 1667 if not self._buffered: 1668 tree.xmlOutputBufferFlush(self._c_out) 1669 self._handle_error(self._c_out.error) 1670 1671 def flush(self): 1672 """flush(self) 1673 1674 Write any pending content of the current output buffer to the stream. 
1675 """ 1676 assert self._c_out is not NULL 1677 tree.xmlOutputBufferFlush(self._c_out) 1678 self._handle_error(self._c_out.error) 1679 1680 cdef _close(self, bint raise_on_error): 1681 if raise_on_error: 1682 if self._status < WRITER_IN_ELEMENT: 1683 raise LxmlSyntaxError("no content written") 1684 if self._element_stack: 1685 raise LxmlSyntaxError("pending open tags on close") 1686 error_result = self._c_out.error 1687 if error_result == xmlerror.XML_ERR_OK: 1688 error_result = tree.xmlOutputBufferClose(self._c_out) 1689 if error_result != -1: 1690 error_result = xmlerror.XML_ERR_OK 1691 else: 1692 tree.xmlOutputBufferClose(self._c_out) 1693 self._status = WRITER_FINISHED 1694 self._c_out = NULL 1695 del self._element_stack[:] 1696 if raise_on_error: 1697 self._handle_error(error_result) 1698 1699 cdef _handle_error(self, int error_result): 1700 if error_result != xmlerror.XML_ERR_OK: 1701 if self._target is not None: 1702 self._target._exc_context._raise_if_stored() 1703 _raiseSerialisationError(error_result) 1704 1705 1706 @cython.final 1707 @cython.internal 1708 cdef class _AsyncDataWriter: 1709 cdef list _data 1710 def __cinit__(self): 1711 self._data = [] 1712 1713 cdef bytes collect(self): 1714 data = b''.join(self._data) 1715 del self._data[:] 1716 return data 1717 1718 def write(self, data): 1719 self._data.append(data) 1720 1721 def close(self): 1722 pass 1723 1724 1725 @cython.final 1726 @cython.internal 1727 cdef class _AsyncIncrementalFileWriter: 1728 cdef _IncrementalFileWriter _writer 1729 cdef _AsyncDataWriter _buffer 1730 cdef object _async_outfile 1731 cdef int _flush_after_writes 1732 cdef bint _should_close 1733 cdef bint _buffered 1734 1735 def __cinit__(self, async_outfile, bytes encoding, int compresslevel, bint close, 1736 bint buffered, int method): 1737 self._flush_after_writes = 20 1738 self._async_outfile = async_outfile 1739 self._should_close = close 1740 self._buffered = buffered 1741 self._buffer = _AsyncDataWriter() 1742 
self._writer = _IncrementalFileWriter( 1743 self._buffer, encoding, compresslevel, close=True, buffered=False, method=method) 1744 1745 cdef bytes _flush(self): 1746 if not self._buffered or len(self._buffer._data) > self._flush_after_writes: 1747 return self._buffer.collect() 1748 return None 1749 1750 async def flush(self): 1751 self._writer.flush() 1752 data = self._buffer.collect() 1753 if data: 1754 await self._async_outfile.write(data) 1755 1756 async def write_declaration(self, version=None, standalone=None, doctype=None): 1757 self._writer.write_declaration(version, standalone, doctype) 1758 data = self._flush() 1759 if data: 1760 await self._async_outfile.write(data) 1761 1762 async def write_doctype(self, doctype): 1763 self._writer.write_doctype(doctype) 1764 data = self._flush() 1765 if data: 1766 await self._async_outfile.write(data) 1767 1768 async def write(self, *args, with_tail=True, pretty_print=False, method=None): 1769 self._writer.write(*args, with_tail=with_tail, pretty_print=pretty_print, method=method) 1770 data = self._flush() 1771 if data: 1772 await self._async_outfile.write(data) 1773 1774 def method(self, method): 1775 return self._writer.method(method) 1776 1777 def element(self, tag, attrib=None, nsmap=None, method=None, **_extra): 1778 element_writer = self._writer.element(tag, attrib, nsmap, method, **_extra) 1779 return _AsyncFileWriterElement(element_writer, self) 1780 1781 async def _close(self, bint raise_on_error): 1782 self._writer._close(raise_on_error) 1783 data = self._buffer.collect() 1784 if data: 1785 await self._async_outfile.write(data) 1786 if self._should_close: 1787 await self._async_outfile.close() 1788 1789 1790 @cython.final 1791 @cython.internal 1792 cdef class _AsyncFileWriterElement: 1793 cdef _FileWriterElement _element_writer 1794 cdef _AsyncIncrementalFileWriter _writer 1795 1796 def __cinit__(self, _FileWriterElement element_writer not None, 1797 _AsyncIncrementalFileWriter writer not None): 1798 
self._element_writer = element_writer 1799 self._writer = writer 1800 1801 async def __aenter__(self): 1802 self._element_writer.__enter__() 1803 data = self._writer._flush() 1804 if data: 1805 await self._writer._async_outfile.write(data) 1806 1807 async def __aexit__(self, *args): 1808 self._element_writer.__exit__(*args) 1809 data = self._writer._flush() 1810 if data: 1811 await self._writer._async_outfile.write(data) 1812 1813 1814 @cython.final 1815 @cython.internal 1816 @cython.freelist(8) 1817 cdef class _FileWriterElement: 1818 cdef _IncrementalFileWriter _writer 1819 cdef object _element 1820 cdef int _new_method 1821 cdef int _old_method 1822 1823 def __cinit__(self, _IncrementalFileWriter writer not None, element_config, int method): 1824 self._writer = writer 1825 self._element = element_config 1826 self._new_method = method 1827 self._old_method = writer._method 1828 1829 def __enter__(self): 1830 self._writer._method = self._new_method 1831 self._writer._write_start_element(self._element) 1832 1833 def __exit__(self, exc_type, exc_val, exc_tb): 1834 self._writer._write_end_element(self._element) 1835 self._writer._method = self._old_method 1836 1837 1838 @cython.final 1839 @cython.internal 1840 @cython.freelist(8) 1841 cdef class _MethodChanger: 1842 cdef _IncrementalFileWriter _writer 1843 cdef int _new_method 1844 cdef int _old_method 1845 cdef bint _entered 1846 cdef bint _exited 1847 1848 def __cinit__(self, _IncrementalFileWriter writer not None, int method): 1849 self._writer = writer 1850 self._new_method = method 1851 self._old_method = writer._method 1852 self._entered = False 1853 self._exited = False 1854 1855 def __enter__(self): 1856 if self._entered: 1857 raise LxmlSyntaxError("Inconsistent enter action in context manager") 1858 self._writer._method = self._new_method 1859 self._entered = True 1860 1861 def __exit__(self, exc_type, exc_val, exc_tb): 1862 if self._exited: 1863 raise LxmlSyntaxError("Inconsistent exit action in context 
manager") 1864 if self._writer._method != self._new_method: 1865 raise LxmlSyntaxError("Method changed outside of context manager") 1866 self._writer._method = self._old_method 1867 self._exited = True 1868 1869 async def __aenter__(self): 1870 # for your async convenience 1871 return self.__enter__() 1872 1873 async def __aexit__(self, *args): 1874 # for your async convenience 1875 return self.__exit__(*args)