etree.pyx
# cython: binding=True
# cython: auto_pickle=False
# cython: language_level=2

"""
The ``lxml.etree`` module implements the extended ElementTree API for XML.
"""

from __future__ import absolute_import

__docformat__ = u"restructuredtext en"

__all__ = [
    'AttributeBasedElementClassLookup', 'C14NError', 'C14NWriterTarget', 'CDATA',
    'Comment', 'CommentBase', 'CustomElementClassLookup', 'DEBUG',
    'DTD', 'DTDError', 'DTDParseError', 'DTDValidateError',
    'DocumentInvalid', 'ETCompatXMLParser', 'ETXPath', 'Element',
    'ElementBase', 'ElementClassLookup', 'ElementDefaultClassLookup',
    'ElementNamespaceClassLookup', 'ElementTree', 'Entity', 'EntityBase',
    'Error', 'ErrorDomains', 'ErrorLevels', 'ErrorTypes', 'Extension',
    'FallbackElementClassLookup', 'FunctionNamespace', 'HTML',
    'HTMLParser', 'LIBXML_COMPILED_VERSION', 'LIBXML_VERSION',
    'LIBXSLT_COMPILED_VERSION', 'LIBXSLT_VERSION', 'LXML_VERSION',
    'LxmlError', 'LxmlRegistryError', 'LxmlSyntaxError',
    'NamespaceRegistryError', 'PI', 'PIBase', 'ParseError',
    'ParserBasedElementClassLookup', 'ParserError', 'ProcessingInstruction',
    'PyErrorLog', 'PythonElementClassLookup', 'QName', 'RelaxNG',
    'RelaxNGError', 'RelaxNGErrorTypes', 'RelaxNGParseError',
    'RelaxNGValidateError', 'Resolver', 'Schematron', 'SchematronError',
    'SchematronParseError', 'SchematronValidateError', 'SerialisationError',
    'SubElement', 'TreeBuilder', 'XInclude', 'XIncludeError', 'XML',
    'XMLDTDID', 'XMLID', 'XMLParser', 'XMLSchema', 'XMLSchemaError',
    'XMLSchemaParseError', 'XMLSchemaValidateError', 'XMLSyntaxError',
    'XMLTreeBuilder', 'XPath', 'XPathDocumentEvaluator', 'XPathError',
    'XPathEvalError', 'XPathEvaluator', 'XPathFunctionError', 'XPathResultError',
    'XPathSyntaxError', 'XSLT', 'XSLTAccessControl', 'XSLTApplyError',
    'XSLTError', 'XSLTExtension', 'XSLTExtensionError', 'XSLTParseError',
    'XSLTSaveError', 'canonicalize',
    'cleanup_namespaces', 'clear_error_log', 'dump',
    'fromstring', 'fromstringlist', 'get_default_parser', 'iselement',
    'iterparse', 'iterwalk', 'parse', 'parseid', 'register_namespace',
    'set_default_parser', 'set_element_class_lookup', 'strip_attributes',
    'strip_elements', 'strip_tags', 'tostring', 'tostringlist', 'tounicode',
    'use_global_python_log'
    ]

cimport cython

from lxml cimport python
from lxml.includes cimport tree, config
from lxml.includes.tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs
from lxml.includes.tree cimport const_xmlChar, xmlChar, _xcstr
from lxml.python cimport _cstr, _isString
from lxml.includes cimport xpath
from lxml.includes cimport c14n

# Cython's standard declarations
cimport cpython.mem
cimport cpython.ref
from libc cimport limits, stdio, stdlib
from libc cimport string as cstring_h   # not to be confused with stdlib 'string'
from libc.string cimport const_char

cdef object os_path_abspath
from os.path import abspath as os_path_abspath

cdef object BytesIO, StringIO
from io import BytesIO, StringIO

cdef object OrderedDict
from collections import OrderedDict

cdef object _elementpath
from lxml import _elementpath

cdef object sys
import sys

cdef object re
import re

cdef object partial
from functools import partial

cdef object islice
from itertools import islice

cdef object ITER_EMPTY = iter(())

cdef object MutableMapping
try:
    from collections.abc import MutableMapping  # Py3.3+
except ImportError:
    from collections import MutableMapping  # Py2.7

# Always-empty mapping: every item access, assignment and deletion raises
# KeyError, membership tests are False, the length is 0 and iteration
# yields nothing.  Only the singleton instance created below is kept; the
# class name itself is deleted from the module namespace afterwards.
class _ImmutableMapping(MutableMapping):
    def __getitem__(self, key):
        raise KeyError, key

    def __setitem__(self, key, value):
        raise KeyError, key

    def __delitem__(self, key):
        raise KeyError, key

    def __contains__(self, key):
        return False

    def __len__(self):
        return 0

    def __iter__(self):
        return ITER_EMPTY
    # Py2-style iterator methods share the (empty) key iterator
    iterkeys = itervalues = iteritems = __iter__

cdef object IMMUTABLE_EMPTY_MAPPING = _ImmutableMapping()
del _ImmutableMapping


# the rules
# ---------
# any libxml C argument/variable is prefixed with c_
# any non-public function/class is prefixed with an underscore
# instance creation is always through factories

# what to do with libxml2/libxslt error messages?
# 0 : drop
# 1 : use log
DEF __DEBUG = 1

# maximum number of lines in the libxml2/xslt log if __DEBUG == 1
DEF __MAX_LOG_SIZE = 100

# make the compiled-in debug state publicly available
DEBUG = __DEBUG

# A struct to store a cached qualified tag name+href pair.
# While we can borrow the c_name from the document dict,
# PyPy requires us to store a Python reference for the
# namespace in order to keep the byte buffer alive.
cdef struct qname:
    const_xmlChar* c_name
    python.PyObject* href

# global per-thread setup
tree.xmlThrDefIndentTreeOutput(1)
tree.xmlThrDefLineNumbersDefaultValue(1)

_initThreadLogging()

# initialize parser (and threading)
xmlparser.xmlInitParser()

# filename encoding
cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")
cdef char* _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)

# set up some default namespace prefixes
cdef dict _DEFAULT_NAMESPACE_PREFIXES = {
    b"http://www.w3.org/XML/1998/namespace": b'xml',
    b"http://www.w3.org/1999/xhtml": b"html",
    b"http://www.w3.org/1999/XSL/Transform": b"xsl",
    b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf",
    b"http://schemas.xmlsoap.org/wsdl/": b"wsdl",
    # xml schema
    b"http://www.w3.org/2001/XMLSchema": b"xs",
    b"http://www.w3.org/2001/XMLSchema-instance": b"xsi",
    # dublin core
    b"http://purl.org/dc/elements/1.1/": b"dc",
    # objectify
    b"http://codespeak.net/lxml/objectify/pytype" : b"py",
}

# To avoid runtime encoding overhead, we keep a Unicode copy
# of the uri-prefix mapping as (str, str) items view (list in Py2).
cdef object _DEFAULT_NAMESPACE_PREFIXES_ITEMS = []

# Rebuild the decoded (UTF-8) items copy from _DEFAULT_NAMESPACE_PREFIXES.
# Must be called again after every change to the bytes registry.
cdef _update_default_namespace_prefixes_items():
    cdef bytes ns, prefix
    global _DEFAULT_NAMESPACE_PREFIXES_ITEMS
    _DEFAULT_NAMESPACE_PREFIXES_ITEMS = {
        ns.decode('utf-8') : prefix.decode('utf-8')
        for ns, prefix in _DEFAULT_NAMESPACE_PREFIXES.items()
    }.items()

_update_default_namespace_prefixes_items()

# Matches prefixes of the form "ns<digits>", which buildNewPrefix() generates
# for documents.  Raw bytes literal so that "\d" reaches the regex engine
# verbatim instead of being an invalid string escape.
cdef object _check_internal_prefix = re.compile(br"ns\d+$").match

def register_namespace(prefix, uri):
    u"""Registers a namespace prefix that newly created Elements in that
    namespace will use.  The registry is global, and any existing
    mapping for either the given prefix or the namespace URI will be
    removed.

    Raises ValueError if the prefix has the reserved "ns<number>" format
    or if the call would change the 'xml' prefix of the XML namespace.
    """
    prefix_utf, uri_utf = _utf8(prefix), _utf8(uri)
    if _check_internal_prefix(prefix_utf):
        raise ValueError("Prefix format reserved for internal use")
    _tagValidOrRaise(prefix_utf)
    _uriValidOrRaise(uri_utf)
    if (uri_utf == b"http://www.w3.org/XML/1998/namespace" and prefix_utf != b'xml'
            or prefix_utf == b'xml' and uri_utf != b"http://www.w3.org/XML/1998/namespace"):
        raise ValueError("Cannot change the 'xml' prefix of the XML namespace")
    # iterate over a copy since entries are deleted while scanning
    for k, v in list(_DEFAULT_NAMESPACE_PREFIXES.items()):
        if k == uri_utf or v == prefix_utf:
            del _DEFAULT_NAMESPACE_PREFIXES[k]
    _DEFAULT_NAMESPACE_PREFIXES[uri_utf] = prefix_utf
    # keep the decoded copy in sync with the bytes registry
    _update_default_namespace_prefixes_items()


# Error superclass for ElementTree compatibility
cdef class Error(Exception):
    pass

# module level superclass for all exceptions
cdef class LxmlError(Error):
    """Main exception base class for lxml.  All other exceptions inherit from
    this one.
    """
    def __init__(self, message, error_log=None):
        super(_Error, self).__init__(message)
        # without an explicit log, snapshot the global error log instead
        if error_log is None:
            self.error_log = __copyGlobalErrorLog()
        else:
            self.error_log = error_log.copy()

cdef object _Error = Error


# superclass for all syntax errors
class LxmlSyntaxError(LxmlError, SyntaxError):
    """Base class for all syntax errors.
    """

cdef class C14NError(LxmlError):
    """Error during C14N serialisation.
    """

# version information

# Convert a dotted ASCII version byte string into a 4-item tuple.
# '-' is treated like '.', and missing parts are padded with 0.
# Non-numeric parts starting with 'dev', 'alpha' or 'beta' map to
# -300, -200 or -100 respectively, plus their numeric suffix (if any).
# Any other non-numeric part is kept in the tuple as the original string.
cdef __unpackDottedVersion(version):
    version_list = []
    l = (version.decode("ascii").replace(u'-', u'.').split(u'.') + [0]*4)[:4]
    for item in l:
        try:
            item = int(item)
        except ValueError:
            if item.startswith(u'dev'):
                count = item[3:]
                item = -300
            elif item.startswith(u'alpha'):
                count = item[5:]
                item = -200
            elif item.startswith(u'beta'):
                count = item[4:]
                item = -100
            else:
                count = 0
            if count:
                item += int(count)
        version_list.append(item)
    return tuple(version_list)

# Split an integer version of the form MMmmpp (two decimal digits per
# component) into a (major, minor, patch) tuple.  Floor division is spelled
# out explicitly so the result does not depend on the language level.
cdef __unpackIntVersion(int c_version):
    return (
        ((c_version // (100*100)) % 100),
        ((c_version // 100)       % 100),
        (c_version                % 100)
        )

cdef int _LIBXML_VERSION_INT
try:
    # use the leading run of digits of the runtime version string
    _LIBXML_VERSION_INT = int(
        re.match(u'[0-9]+', (<unsigned char*>tree.xmlParserVersion).decode("ascii")).group(0))
except Exception:
    print u"Unknown libxml2 version: %s" % (<unsigned char*>tree.xmlParserVersion).decode("latin1")
    _LIBXML_VERSION_INT = 0

LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT)
LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION)
LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING)

__version__ = tree.LXML_VERSION_STRING.decode("ascii")


# class for temporary storage of Python references,
# used e.g.
for XPath results 287 @cython.final 288 @cython.internal 289 cdef class _TempStore: 290 cdef list _storage 291 def __init__(self): 292 self._storage = [] 293 294 cdef int add(self, obj) except -1: 295 self._storage.append(obj) 296 return 0 297 298 cdef int clear(self) except -1: 299 del self._storage[:] 300 return 0 301 302 303 # class for temporarily storing exceptions raised in extensions 304 @cython.internal 305 cdef class _ExceptionContext: 306 cdef object _exc_info 307 cdef int clear(self) except -1: 308 self._exc_info = None 309 return 0 310 311 cdef void _store_raised(self): 312 try: 313 self._exc_info = sys.exc_info() 314 except BaseException as e: 315 self._store_exception(e) 316 finally: 317 return # and swallow any further exceptions 318 319 cdef int _store_exception(self, exception) except -1: 320 self._exc_info = (exception, None, None) 321 return 0 322 323 cdef bint _has_raised(self) except -1: 324 return self._exc_info is not None 325 326 cdef int _raise_if_stored(self) except -1: 327 if self._exc_info is None: 328 return 0 329 type, value, traceback = self._exc_info 330 self._exc_info = None 331 if value is None and traceback is None: 332 raise type 333 else: 334 raise type, value, traceback 335 336 337 # type of a function that steps from node to node 338 ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*) 339 340 341 ################################################################################ 342 # Include submodules 343 344 include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.) 345 include "apihelpers.pxi" # Private helper functions 346 include "xmlerror.pxi" # Error and log handling 347 348 349 ################################################################################ 350 # Public Python API 351 352 @cython.final 353 @cython.freelist(8) 354 cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]: 355 u"""Internal base class to reference a libxml document. 
356 357 When instances of this class are garbage collected, the libxml 358 document is cleaned up. 359 """ 360 cdef int _ns_counter 361 cdef bytes _prefix_tail 362 cdef xmlDoc* _c_doc 363 cdef _BaseParser _parser 364 365 def __dealloc__(self): 366 # if there are no more references to the document, it is safe 367 # to clean the whole thing up, as all nodes have a reference to 368 # the document 369 tree.xmlFreeDoc(self._c_doc) 370 371 @cython.final 372 cdef getroot(self): 373 # return an element proxy for the document root 374 cdef xmlNode* c_node 375 c_node = tree.xmlDocGetRootElement(self._c_doc) 376 if c_node is NULL: 377 return None 378 return _elementFactory(self, c_node) 379 380 @cython.final 381 cdef bint hasdoctype(self): 382 # DOCTYPE gets parsed into internal subset (xmlDTD*) 383 return self._c_doc is not NULL and self._c_doc.intSubset is not NULL 384 385 @cython.final 386 cdef getdoctype(self): 387 # get doctype info: root tag, public/system ID (or None if not known) 388 cdef tree.xmlDtd* c_dtd 389 cdef xmlNode* c_root_node 390 public_id = None 391 sys_url = None 392 c_dtd = self._c_doc.intSubset 393 if c_dtd is not NULL: 394 if c_dtd.ExternalID is not NULL: 395 public_id = funicode(c_dtd.ExternalID) 396 if c_dtd.SystemID is not NULL: 397 sys_url = funicode(c_dtd.SystemID) 398 c_dtd = self._c_doc.extSubset 399 if c_dtd is not NULL: 400 if not public_id and c_dtd.ExternalID is not NULL: 401 public_id = funicode(c_dtd.ExternalID) 402 if not sys_url and c_dtd.SystemID is not NULL: 403 sys_url = funicode(c_dtd.SystemID) 404 c_root_node = tree.xmlDocGetRootElement(self._c_doc) 405 if c_root_node is NULL: 406 root_name = None 407 else: 408 root_name = funicode(c_root_node.name) 409 return root_name, public_id, sys_url 410 411 @cython.final 412 cdef getxmlinfo(self): 413 # return XML version and encoding (or None if not known) 414 cdef xmlDoc* c_doc = self._c_doc 415 if c_doc.version is NULL: 416 version = None 417 else: 418 version = funicode(c_doc.version) 419 
if c_doc.encoding is NULL: 420 encoding = None 421 else: 422 encoding = funicode(c_doc.encoding) 423 return version, encoding 424 425 @cython.final 426 cdef isstandalone(self): 427 # returns True for "standalone=true", 428 # False for "standalone=false", None if not provided 429 if self._c_doc.standalone == -1: 430 return None 431 else: 432 return <bint>(self._c_doc.standalone == 1) 433 434 @cython.final 435 cdef bytes buildNewPrefix(self): 436 # get a new unique prefix ("nsX") for this document 437 cdef bytes ns 438 if self._ns_counter < len(_PREFIX_CACHE): 439 ns = _PREFIX_CACHE[self._ns_counter] 440 else: 441 ns = python.PyBytes_FromFormat("ns%d", self._ns_counter) 442 if self._prefix_tail is not None: 443 ns += self._prefix_tail 444 self._ns_counter += 1 445 if self._ns_counter < 0: 446 # overflow! 447 self._ns_counter = 0 448 if self._prefix_tail is None: 449 self._prefix_tail = b"A" 450 else: 451 self._prefix_tail += b"A" 452 return ns 453 454 @cython.final 455 cdef xmlNs* _findOrBuildNodeNs(self, xmlNode* c_node, 456 const_xmlChar* c_href, const_xmlChar* c_prefix, 457 bint is_attribute) except NULL: 458 u"""Get or create namespace structure for a node. Reuses the prefix if 459 possible. 
460 """ 461 cdef xmlNs* c_ns 462 cdef xmlNs* c_doc_ns 463 cdef python.PyObject* dict_result 464 if c_node.type != tree.XML_ELEMENT_NODE: 465 assert c_node.type == tree.XML_ELEMENT_NODE, \ 466 u"invalid node type %d, expected %d" % ( 467 c_node.type, tree.XML_ELEMENT_NODE) 468 # look for existing ns declaration 469 c_ns = _searchNsByHref(c_node, c_href, is_attribute) 470 if c_ns is not NULL: 471 if is_attribute and c_ns.prefix is NULL: 472 # do not put namespaced attributes into the default 473 # namespace as this would break serialisation 474 pass 475 else: 476 return c_ns 477 478 # none found => determine a suitable new prefix 479 if c_prefix is NULL: 480 dict_result = python.PyDict_GetItem( 481 _DEFAULT_NAMESPACE_PREFIXES, <unsigned char*>c_href) 482 if dict_result is not NULL: 483 prefix = <object>dict_result 484 else: 485 prefix = self.buildNewPrefix() 486 c_prefix = _xcstr(prefix) 487 488 # make sure the prefix is not in use already 489 while tree.xmlSearchNs(self._c_doc, c_node, c_prefix) is not NULL: 490 prefix = self.buildNewPrefix() 491 c_prefix = _xcstr(prefix) 492 493 # declare the namespace and return it 494 c_ns = tree.xmlNewNs(c_node, c_href, c_prefix) 495 if c_ns is NULL: 496 raise MemoryError() 497 return c_ns 498 499 @cython.final 500 cdef int _setNodeNs(self, xmlNode* c_node, const_xmlChar* c_href) except -1: 501 u"Lookup namespace structure and set it for the node." 
502 c_ns = self._findOrBuildNodeNs(c_node, c_href, NULL, 0) 503 tree.xmlSetNs(c_node, c_ns) 504 505 cdef tuple __initPrefixCache(): 506 cdef int i 507 return tuple([ python.PyBytes_FromFormat("ns%d", i) 508 for i in range(30) ]) 509 510 cdef tuple _PREFIX_CACHE = __initPrefixCache() 511 512 cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser): 513 cdef _Document result 514 result = _Document.__new__(_Document) 515 result._c_doc = c_doc 516 result._ns_counter = 0 517 result._prefix_tail = None 518 if parser is None: 519 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() 520 result._parser = parser 521 return result 522 523 524 cdef object _find_invalid_public_id_characters = re.compile( 525 ur"[^\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]+").search 526 527 528 cdef class DocInfo: 529 u"Document information provided by parser and DTD." 530 cdef _Document _doc 531 def __cinit__(self, tree): 532 u"Create a DocInfo object for an ElementTree object or root Element." 533 self._doc = _documentOrRaise(tree) 534 root_name, public_id, system_url = self._doc.getdoctype() 535 if not root_name and (public_id or system_url): 536 raise ValueError, u"Could not find root node" 537 538 @property 539 def root_name(self): 540 """Returns the name of the root node as defined by the DOCTYPE.""" 541 root_name, public_id, system_url = self._doc.getdoctype() 542 return root_name 543 544 @cython.final 545 cdef tree.xmlDtd* _get_c_dtd(self): 546 """"Return the DTD. 
Create it if it does not yet exist.""" 547 cdef xmlDoc* c_doc = self._doc._c_doc 548 cdef xmlNode* c_root_node 549 cdef const_xmlChar* c_name 550 551 if c_doc.intSubset: 552 return c_doc.intSubset 553 554 c_root_node = tree.xmlDocGetRootElement(c_doc) 555 c_name = c_root_node.name if c_root_node else NULL 556 return tree.xmlCreateIntSubset(c_doc, c_name, NULL, NULL) 557 558 def clear(self): 559 u"""Removes DOCTYPE and internal subset from the document.""" 560 cdef xmlDoc* c_doc = self._doc._c_doc 561 cdef tree.xmlNode* c_dtd = <xmlNode*>c_doc.intSubset 562 if c_dtd is NULL: 563 return 564 tree.xmlUnlinkNode(c_dtd) 565 tree.xmlFreeNode(c_dtd) 566 567 property public_id: 568 u"""Public ID of the DOCTYPE. 569 570 Mutable. May be set to a valid string or None. If a DTD does not 571 exist, setting this variable (even to None) will create one. 572 """ 573 def __get__(self): 574 root_name, public_id, system_url = self._doc.getdoctype() 575 return public_id 576 577 def __set__(self, value): 578 cdef xmlChar* c_value = NULL 579 if value is not None: 580 match = _find_invalid_public_id_characters(value) 581 if match: 582 raise ValueError, f'Invalid character(s) {match.group(0)!r} in public_id.' 583 value = _utf8(value) 584 c_value = tree.xmlStrdup(_xcstr(value)) 585 if not c_value: 586 raise MemoryError() 587 588 c_dtd = self._get_c_dtd() 589 if not c_dtd: 590 tree.xmlFree(c_value) 591 raise MemoryError() 592 if c_dtd.ExternalID: 593 tree.xmlFree(<void*>c_dtd.ExternalID) 594 c_dtd.ExternalID = c_value 595 596 property system_url: 597 u"""System ID of the DOCTYPE. 598 599 Mutable. May be set to a valid string or None. If a DTD does not 600 exist, setting this variable (even to None) will create one. 
601 """ 602 def __get__(self): 603 root_name, public_id, system_url = self._doc.getdoctype() 604 return system_url 605 606 def __set__(self, value): 607 cdef xmlChar* c_value = NULL 608 if value is not None: 609 bvalue = _utf8(value) 610 # sys_url may be any valid unicode string that can be 611 # enclosed in single quotes or quotes. 612 if b"'" in bvalue and b'"' in bvalue: 613 raise ValueError( 614 'System URL may not contain both single (\') and double quotes (").') 615 c_value = tree.xmlStrdup(_xcstr(bvalue)) 616 if not c_value: 617 raise MemoryError() 618 619 c_dtd = self._get_c_dtd() 620 if not c_dtd: 621 tree.xmlFree(c_value) 622 raise MemoryError() 623 if c_dtd.SystemID: 624 tree.xmlFree(<void*>c_dtd.SystemID) 625 c_dtd.SystemID = c_value 626 627 @property 628 def xml_version(self): 629 """Returns the XML version as declared by the document.""" 630 xml_version, encoding = self._doc.getxmlinfo() 631 return xml_version 632 633 @property 634 def encoding(self): 635 """Returns the encoding name as declared by the document.""" 636 xml_version, encoding = self._doc.getxmlinfo() 637 return encoding 638 639 @property 640 def standalone(self): 641 """Returns the standalone flag as declared by the document. The possible 642 values are True (``standalone='yes'``), False 643 (``standalone='no'`` or flag not provided in the declaration), 644 and None (unknown or no declaration found). Note that a 645 normal truth test on this value will always tell if the 646 ``standalone`` flag was set to ``'yes'`` or not. 647 """ 648 return self._doc.isstandalone() 649 650 property URL: 651 u"The source URL of the document (or None if unknown)." 
652 def __get__(self): 653 if self._doc._c_doc.URL is NULL: 654 return None 655 return _decodeFilename(self._doc._c_doc.URL) 656 def __set__(self, url): 657 url = _encodeFilename(url) 658 c_oldurl = self._doc._c_doc.URL 659 if url is None: 660 self._doc._c_doc.URL = NULL 661 else: 662 self._doc._c_doc.URL = tree.xmlStrdup(_xcstr(url)) 663 if c_oldurl is not NULL: 664 tree.xmlFree(<void*>c_oldurl) 665 666 @property 667 def doctype(self): 668 """Returns a DOCTYPE declaration string for the document.""" 669 root_name, public_id, system_url = self._doc.getdoctype() 670 if system_url: 671 # If '"' in system_url, we must escape it with single 672 # quotes, otherwise escape with double quotes. If url 673 # contains both a single quote and a double quote, XML 674 # standard is being violated. 675 if '"' in system_url: 676 quoted_system_url = f"'{system_url}'" 677 else: 678 quoted_system_url = f'"{system_url}"' 679 if public_id: 680 if system_url: 681 return f'<!DOCTYPE {root_name} PUBLIC "{public_id}" {quoted_system_url}>' 682 else: 683 return f'<!DOCTYPE {root_name} PUBLIC "{public_id}">' 684 elif system_url: 685 return f'<!DOCTYPE {root_name} SYSTEM {quoted_system_url}>' 686 elif self._doc.hasdoctype(): 687 return f'<!DOCTYPE {root_name}>' 688 else: 689 return u'' 690 691 @property 692 def internalDTD(self): 693 """Returns a DTD validator based on the internal subset of the document.""" 694 return _dtdFactory(self._doc._c_doc.intSubset) 695 696 @property 697 def externalDTD(self): 698 """Returns a DTD validator based on the external subset of the document.""" 699 return _dtdFactory(self._doc._c_doc.extSubset) 700 701 702 @cython.no_gc_clear 703 cdef public class _Element [ type LxmlElementType, object LxmlElement ]: 704 u"""Element class. 705 706 References a document object and a libxml node. 707 708 By pointing to a Document instance, a reference is kept to 709 _Document as long as there is some pointer to a node in it. 
710 """ 711 cdef _Document _doc 712 cdef xmlNode* _c_node 713 cdef object _tag 714 715 def _init(self): 716 u"""_init(self) 717 718 Called after object initialisation. Custom subclasses may override 719 this if they recursively call _init() in the superclasses. 720 """ 721 722 @cython.linetrace(False) 723 @cython.profile(False) 724 def __dealloc__(self): 725 #print "trying to free node:", <int>self._c_node 726 #displayNode(self._c_node, 0) 727 if self._c_node is not NULL: 728 _unregisterProxy(self) 729 attemptDeallocation(self._c_node) 730 731 # MANIPULATORS 732 733 def __setitem__(self, x, value): 734 u"""__setitem__(self, x, value) 735 736 Replaces the given subelement index or slice. 737 """ 738 cdef xmlNode* c_node = NULL 739 cdef xmlNode* c_next 740 cdef xmlDoc* c_source_doc 741 cdef _Element element 742 cdef bint left_to_right 743 cdef Py_ssize_t slicelength = 0, step = 0 744 _assertValidNode(self) 745 if value is None: 746 raise ValueError, u"cannot assign None" 747 if isinstance(x, slice): 748 # slice assignment 749 _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength) 750 if step > 0: 751 left_to_right = 1 752 else: 753 left_to_right = 0 754 step = -step 755 _replaceSlice(self, c_node, slicelength, step, left_to_right, value) 756 return 757 else: 758 # otherwise: normal item assignment 759 element = value 760 _assertValidNode(element) 761 c_node = _findChild(self._c_node, x) 762 if c_node is NULL: 763 raise IndexError, u"list index out of range" 764 c_source_doc = element._c_node.doc 765 c_next = element._c_node.next 766 _removeText(c_node.next) 767 tree.xmlReplaceNode(c_node, element._c_node) 768 _moveTail(c_next, element._c_node) 769 moveNodeToDocument(self._doc, c_source_doc, element._c_node) 770 if not attemptDeallocation(c_node): 771 moveNodeToDocument(self._doc, c_node.doc, c_node) 772 773 def __delitem__(self, x): 774 u"""__delitem__(self, x) 775 776 Deletes the given subelement or a slice. 
777 """ 778 cdef xmlNode* c_node = NULL 779 cdef xmlNode* c_next 780 cdef Py_ssize_t step = 0, slicelength = 0 781 _assertValidNode(self) 782 if isinstance(x, slice): 783 # slice deletion 784 if _isFullSlice(<slice>x): 785 c_node = self._c_node.children 786 if c_node is not NULL: 787 if not _isElement(c_node): 788 c_node = _nextElement(c_node) 789 while c_node is not NULL: 790 c_next = _nextElement(c_node) 791 _removeNode(self._doc, c_node) 792 c_node = c_next 793 else: 794 _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength) 795 _deleteSlice(self._doc, c_node, slicelength, step) 796 else: 797 # item deletion 798 c_node = _findChild(self._c_node, x) 799 if c_node is NULL: 800 raise IndexError, f"index out of range: {x}" 801 _removeNode(self._doc, c_node) 802 803 def __deepcopy__(self, memo): 804 u"__deepcopy__(self, memo)" 805 return self.__copy__() 806 807 def __copy__(self): 808 u"__copy__(self)" 809 cdef xmlDoc* c_doc 810 cdef xmlNode* c_node 811 cdef _Document new_doc 812 _assertValidNode(self) 813 c_doc = _copyDocRoot(self._doc._c_doc, self._c_node) # recursive 814 new_doc = _documentFactory(c_doc, self._doc._parser) 815 root = new_doc.getroot() 816 if root is not None: 817 return root 818 # Comment/PI 819 c_node = c_doc.children 820 while c_node is not NULL and c_node.type != self._c_node.type: 821 c_node = c_node.next 822 if c_node is NULL: 823 return None 824 return _elementFactory(new_doc, c_node) 825 826 def set(self, key, value): 827 u"""set(self, key, value) 828 829 Sets an element attribute. 830 In HTML documents (not XML or XHTML), the value None is allowed and creates 831 an attribute without value (just the attribute name). 832 """ 833 _assertValidNode(self) 834 _setAttributeValue(self, key, value) 835 836 def append(self, _Element element not None): 837 u"""append(self, element) 838 839 Adds a subelement to the end of this element. 
840 """ 841 _assertValidNode(self) 842 _assertValidNode(element) 843 _appendChild(self, element) 844 845 def addnext(self, _Element element not None): 846 u"""addnext(self, element) 847 848 Adds the element as a following sibling directly after this 849 element. 850 851 This is normally used to set a processing instruction or comment after 852 the root node of a document. Note that tail text is automatically 853 discarded when adding at the root level. 854 """ 855 _assertValidNode(self) 856 _assertValidNode(element) 857 if self._c_node.parent != NULL and not _isElement(self._c_node.parent): 858 if element._c_node.type != tree.XML_PI_NODE: 859 if element._c_node.type != tree.XML_COMMENT_NODE: 860 raise TypeError, u"Only processing instructions and comments can be siblings of the root element" 861 element.tail = None 862 _appendSibling(self, element) 863 864 def addprevious(self, _Element element not None): 865 u"""addprevious(self, element) 866 867 Adds the element as a preceding sibling directly before this 868 element. 869 870 This is normally used to set a processing instruction or comment 871 before the root node of a document. Note that tail text is 872 automatically discarded when adding at the root level. 873 """ 874 _assertValidNode(self) 875 _assertValidNode(element) 876 if self._c_node.parent != NULL and not _isElement(self._c_node.parent): 877 if element._c_node.type != tree.XML_PI_NODE: 878 if element._c_node.type != tree.XML_COMMENT_NODE: 879 raise TypeError, u"Only processing instructions and comments can be siblings of the root element" 880 element.tail = None 881 _prependSibling(self, element) 882 883 def extend(self, elements): 884 u"""extend(self, elements) 885 886 Extends the current children by the elements in the iterable. 
887 """ 888 cdef _Element element 889 _assertValidNode(self) 890 for element in elements: 891 if element is None: 892 raise TypeError, u"Node must not be None" 893 _assertValidNode(element) 894 _appendChild(self, element) 895 896 def clear(self, bint keep_tail=False): 897 u"""clear(self, keep_tail=False) 898 899 Resets an element. This function removes all subelements, clears 900 all attributes and sets the text and tail properties to None. 901 902 Pass ``keep_tail=True`` to leave the tail text untouched. 903 """ 904 cdef xmlAttr* c_attr 905 cdef xmlAttr* c_attr_next 906 cdef xmlNode* c_node 907 cdef xmlNode* c_node_next 908 _assertValidNode(self) 909 c_node = self._c_node 910 # remove self.text and self.tail 911 _removeText(c_node.children) 912 if not keep_tail: 913 _removeText(c_node.next) 914 # remove all attributes 915 c_attr = c_node.properties 916 if c_attr: 917 c_node.properties = NULL 918 tree.xmlFreePropList(c_attr) 919 # remove all subelements 920 c_node = c_node.children 921 if c_node and not _isElement(c_node): 922 c_node = _nextElement(c_node) 923 while c_node is not NULL: 924 c_node_next = _nextElement(c_node) 925 _removeNode(self._doc, c_node) 926 c_node = c_node_next 927 928 def insert(self, index: int, _Element element not None): 929 u"""insert(self, index, element) 930 931 Inserts a subelement at the given position in this element 932 """ 933 cdef xmlNode* c_node 934 cdef xmlNode* c_next 935 cdef xmlDoc* c_source_doc 936 _assertValidNode(self) 937 _assertValidNode(element) 938 c_node = _findChild(self._c_node, index) 939 if c_node is NULL: 940 _appendChild(self, element) 941 return 942 c_source_doc = element._c_node.doc 943 c_next = element._c_node.next 944 tree.xmlAddPrevSibling(c_node, element._c_node) 945 _moveTail(c_next, element._c_node) 946 moveNodeToDocument(self._doc, c_source_doc, element._c_node) 947 948 def remove(self, _Element element not None): 949 u"""remove(self, element) 950 951 Removes a matching subelement. 
Unlike the find methods, this 952 method compares elements based on identity, not on tag value 953 or contents. 954 """ 955 cdef xmlNode* c_node 956 cdef xmlNode* c_next 957 _assertValidNode(self) 958 _assertValidNode(element) 959 c_node = element._c_node 960 if c_node.parent is not self._c_node: 961 raise ValueError, u"Element is not a child of this node." 962 c_next = element._c_node.next 963 tree.xmlUnlinkNode(c_node) 964 _moveTail(c_next, c_node) 965 # fix namespace declarations 966 moveNodeToDocument(self._doc, c_node.doc, c_node) 967 968 def replace(self, _Element old_element not None, 969 _Element new_element not None): 970 u"""replace(self, old_element, new_element) 971 972 Replaces a subelement with the element passed as second argument. 973 """ 974 cdef xmlNode* c_old_node 975 cdef xmlNode* c_old_next 976 cdef xmlNode* c_new_node 977 cdef xmlNode* c_new_next 978 cdef xmlDoc* c_source_doc 979 _assertValidNode(self) 980 _assertValidNode(old_element) 981 _assertValidNode(new_element) 982 c_old_node = old_element._c_node 983 if c_old_node.parent is not self._c_node: 984 raise ValueError, u"Element is not a child of this node." 
985 c_old_next = c_old_node.next 986 c_new_node = new_element._c_node 987 c_new_next = c_new_node.next 988 c_source_doc = c_new_node.doc 989 tree.xmlReplaceNode(c_old_node, c_new_node) 990 _moveTail(c_new_next, c_new_node) 991 _moveTail(c_old_next, c_old_node) 992 moveNodeToDocument(self._doc, c_source_doc, c_new_node) 993 # fix namespace declarations 994 moveNodeToDocument(self._doc, c_old_node.doc, c_old_node) 995 996 # PROPERTIES 997 property tag: 998 u"""Element tag 999 """ 1000 def __get__(self): 1001 if self._tag is not None: 1002 return self._tag 1003 _assertValidNode(self) 1004 self._tag = _namespacedName(self._c_node) 1005 return self._tag 1006 1007 def __set__(self, value): 1008 cdef _BaseParser parser 1009 _assertValidNode(self) 1010 ns, name = _getNsTag(value) 1011 parser = self._doc._parser 1012 if parser is not None and parser._for_html: 1013 _htmlTagValidOrRaise(name) 1014 else: 1015 _tagValidOrRaise(name) 1016 self._tag = value 1017 tree.xmlNodeSetName(self._c_node, _xcstr(name)) 1018 if ns is None: 1019 self._c_node.ns = NULL 1020 else: 1021 self._doc._setNodeNs(self._c_node, _xcstr(ns)) 1022 1023 @property 1024 def attrib(self): 1025 """Element attribute dictionary. Where possible, use get(), set(), 1026 keys(), values() and items() to access element attributes. 1027 """ 1028 return _Attrib.__new__(_Attrib, self) 1029 1030 property text: 1031 u"""Text before the first subelement. This is either a string or 1032 the value None, if there was no text. 
# not in ElementTree, read-only
@property
def prefix(self):
    """Namespace prefix or None.

    Returns the prefix of the namespace that is set on this element as
    a string.  Returns None if the element has no namespace or if its
    namespace has no prefix (default namespace).
    """
    # The C node is dereferenced directly below, so make sure the proxy
    # actually refers to one (same check as in 'tag', 'text', 'tail' and
    # 'nsmap') instead of reading through a NULL pointer.
    _assertValidNode(self)
    if self._c_node.ns is not NULL:
        if self._c_node.ns.prefix is not NULL:
            return funicode(self._c_node.ns.prefix)
    return None
def __getitem__(self, x):
    u"""Looks up a single subelement by position, or a list of
    subelements when a slice is passed.

    Raises IndexError if there is no subelement at the given position.
    """
    cdef xmlNode* c_node = NULL
    cdef Py_ssize_t step = 0, slicelength = 0
    cdef Py_ssize_t collected, hop
    cdef _node_to_node_function advance
    cdef list found
    _assertValidNode(self)

    if not isinstance(x, slice):
        # indexing
        c_node = _findChild(self._c_node, x)
        if c_node is NULL:
            raise IndexError, u"list index out of range"
        return _elementFactory(self._doc, c_node)

    # slicing
    if _isFullSlice(<slice>x):
        return _collectChildren(self)
    _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
    if c_node is NULL:
        return []

    # walk backwards for a negative step, always with a positive stride
    if step <= 0:
        step = -step
        advance = _previousElement
    else:
        advance = _nextElement

    found = []
    collected = 0
    while c_node is not NULL and collected < slicelength:
        found.append(_elementFactory(self._doc, c_node))
        collected += 1
        # skip 'step' elements ahead, stopping early at the end of the
        # sibling chain
        for hop in range(step):
            c_node = advance(c_node)
            if c_node is NULL:
                break
    return found
def index(self, _Element child not None, start: int = None, stop: int = None):
    u"""index(self, child, start=None, stop=None)

    Find the position of the child within the parent.

    The position counts only the element siblings that precede the
    child.  ``start`` and ``stop`` restrict the search to a slice of the
    children and may be negative.

    Raises ValueError if ``child`` is not a child of this element or if
    it lies outside of the requested slice.

    This method is not part of the original ElementTree API.
    """
    cdef Py_ssize_t k, l
    cdef Py_ssize_t c_start, c_stop
    cdef xmlNode* c_child
    cdef xmlNode* c_start_node
    _assertValidNode(self)
    _assertValidNode(child)
    c_child = child._c_node
    if c_child.parent is not self._c_node:
        raise ValueError, u"Element is not a child of this node."

    # handle the unbounded search straight away (normal case)
    if stop is None and (start is None or start == 0):
        # the index is the number of element nodes before the child
        k = 0
        c_child = c_child.prev
        while c_child is not NULL:
            if _isElement(c_child):
                k += 1
            c_child = c_child.prev
        return k

    # check indices
    if start is None:
        c_start = 0
    else:
        c_start = start
    if stop is None:
        # c_stop == 0 doubles as "no upper bound" from here on
        c_stop = 0
    else:
        c_stop = stop
        # an explicit stop of 0, or a slice that is empty by construction
        if c_stop == 0 or \
               c_start >= c_stop and (c_stop > 0 or c_start < 0):
            raise ValueError, u"list.index(x): x not in slice"

    # for negative slice indices, check slice before searching index
    if c_start < 0 or c_stop < 0:
        # start from right, at most up to leftmost(c_start, c_stop)
        if c_start < c_stop:
            k = -c_start
        else:
            k = -c_stop
        c_start_node = self._c_node.last
        # l is the 1-based position of c_start_node counted from the end
        l = 1
        while c_start_node != c_child and l < k:
            if _isElement(c_start_node):
                l += 1
            c_start_node = c_start_node.prev
        if c_start_node == c_child:
            # found! before slice end?
            if c_stop < 0 and l <= -c_stop:
                raise ValueError, u"list.index(x): x not in slice"
        elif c_start < 0:
            # child was not reached within the last -c_start elements
            raise ValueError, u"list.index(x): x not in slice"

    # now determine the index backwards from child
    c_child = c_child.prev
    k = 0
    if c_stop > 0:
        # we can optimize: stop after c_stop elements if not found
        while c_child != NULL and k < c_stop:
            if _isElement(c_child):
                k += 1
            c_child = c_child.prev
        if k < c_stop:
            return k
    else:
        # traverse all
        while c_child != NULL:
            if _isElement(c_child):
                k = k + 1
            c_child = c_child.prev
        if c_start > 0:
            # only accept positions at or after a positive start
            if k >= c_start:
                return k
        else:
            return k
    # falling through means the child lies outside the requested range
    if c_start != 0 or c_stop != 0:
        raise ValueError, u"list.index(x): x not in slice"
    else:
        raise ValueError, u"list.index(x): x not in list"
def itersiblings(self, tag=None, *tags, preceding=False):
    u"""itersiblings(self, tag=None, *tags, preceding=False)

    Iterate over the siblings of this element, in one direction.

    By default (``preceding=False``), the following siblings are
    yielded in document order.  With ``preceding=True``, the preceding
    siblings are yielded in reverse document order, i.e. starting right
    before the current element and going backwards.

    Can be restricted to find only elements with specific tags,
    see `iter`.
    """
    # shortcut: nothing to iterate if there is no neighbouring node in
    # the requested direction
    if self._c_node is not NULL:
        if preceding:
            if self._c_node.prev is NULL:
                return ITER_EMPTY
        else:
            if self._c_node.next is NULL:
                return ITER_EMPTY
    if tag is not None:
        tags = tags + (tag,)
    return SiblingsIterator(self, tags, preceding=preceding)
def iter(self, tag=None, *tags):
    u"""iter(self, tag=None, *tags)

    Iterate over this element and all elements in its subtree, in
    document order (depth first pre-order).

    To restrict the result to specific tags, pass ``"{ns}localname"``
    as tag.  Either or both of ``ns`` and ``localname`` can be ``*``
    for a wildcard; ``ns`` can be empty for no namespace.
    ``"localname"`` is equivalent to ``"{}localname"`` (i.e. no
    namespace) but ``"*"`` is ``"{*}*"`` (any or no namespace), not
    ``"{}*"``.

    The Element, Comment, ProcessingInstruction and Entity factory
    functions can also be passed in order to look only for the specific
    element type.

    When multiple tags (or a sequence of tags) are passed instead of a
    single tag, the iterator returns all elements matching any of these
    tags, in document order.
    """
    # the first positional tag is simply appended to the other ones
    selected = tags if tag is None else tags + (tag,)
    return ElementDepthFirstIterator(self, selected)
def findall(self, path, namespaces=None):
    u"""findall(self, path, namespaces=None)

    Finds all matching subelements, by tag name or path.

    ``path`` may also be a QName, in which case its qualified text is
    used as the path.

    The optional ``namespaces`` argument accepts a
    prefix-to-namespace mapping that allows the usage of XPath
    prefixes in the path expression.
    """
    expression = (<QName>path).text if isinstance(path, QName) else path
    return _elementpath.findall(self, expression, namespaces)
@cython.internal
cdef class __ContentOnlyElement(_Element):
    # Common base class of the node types below (comments, processing
    # instructions, entities): they have text content but neither
    # children nor attributes.  All modifying container methods raise
    # TypeError, all reading ones behave like an empty element.

    cdef int _raiseImmutable(self) except -1:
        # always raises; 'except -1' lets Cython propagate the exception
        raise TypeError, u"this element does not have children or attributes"

    def set(self, key, value):
        u"set(self, key, value)"
        self._raiseImmutable()

    def append(self, value):
        u"append(self, value)"
        self._raiseImmutable()

    def insert(self, index, value):
        u"insert(self, index, value)"
        self._raiseImmutable()

    def __setitem__(self, index, value):
        u"__setitem__(self, index, value)"
        self._raiseImmutable()

    @property
    def attrib(self):
        """Always an empty, immutable mapping."""
        return IMMUTABLE_EMPTY_MAPPING

    property text:
        u"""The text content of the node.  Setting it to None clears the
        content.
        """
        def __get__(self):
            _assertValidNode(self)
            return funicodeOrEmpty(self._c_node.content)

        def __set__(self, value):
            _assertValidNode(self)
            if value is None:
                c_text = <const_xmlChar*>NULL
            else:
                # keep the encoded bytes object referenced in 'value'
                # while its buffer is passed to libxml2
                value = _utf8(value)
                c_text = _xcstr(value)
            tree.xmlNodeSetContent(self._c_node, c_text)

    # ACCESSORS
    def __getitem__(self, x):
        u"__getitem__(self, x)"
        # there are no children: slices are empty, indices out of range
        if isinstance(x, slice):
            return []
        else:
            raise IndexError, u"list index out of range"

    def __len__(self):
        u"__len__(self)"
        return 0

    def get(self, key, default=None):
        u"get(self, key, default=None)"
        # there are no attributes, so every lookup misses and must yield
        # the caller's default (as for any Element without that attribute)
        return default

    def keys(self):
        u"keys(self)"
        return []

    def items(self):
        u"items(self)"
        return []

    def values(self):
        u"values(self)"
        return []
# Extracts the pseudo-attributes from the text of a processing instruction.
# Each match consists of leading whitespace, a name (\w+), '=' and a value
# in single or double quotes.  Calling it returns a list of
# (name, single_quoted_value, double_quoted_value) tuples in which the group
# of the quote style that was not used is empty.  Since whitespace is
# required before each name, _ProcessingInstruction.attrib prepends a space
# to the text before searching it.
cdef object _FIND_PI_ATTRIBUTES = re.compile(ur'\s+(\w+)\s*=\s*(?:\'([^\']*)\'|"([^"]*)")', re.U).findall
cdef class QName:
    u"""QName(text_or_uri_or_element, tag=None)

    QName wrapper for qualified XML names.

    A qualified name can be created from a tag name by itself, or from
    a namespace URI and a tag name.  Alternatively, an Element can be
    passed in order to extract its tag name.  ``None`` as first argument
    is ignored in order to allow for generic 2-argument usage.

    The qualified name is available in ``{namespace}tagname`` notation
    from the ``text`` property.  The ``namespace`` and ``localname``
    properties hold the respective parts of the tag name.

    QName objects can be passed wherever a tag name is expected.  Also,
    setting Element text from a QName will resolve the namespace prefix
    on assignment and set a qualified text value.  This is helpful in XML
    languages like SOAP or XML-Schema that use prefixed tag names in
    their text content.
    """
    cdef readonly unicode text
    cdef readonly unicode localname
    cdef readonly unicode namespace

    def __init__(self, text_or_uri_or_element, tag=None):
        if text_or_uri_or_element is None:
            # Allow None as no namespace.
            text_or_uri_or_element, tag = tag, None

        # normalise the first argument into a string
        if _isString(text_or_uri_or_element):
            pass
        elif isinstance(text_or_uri_or_element, _Element):
            text_or_uri_or_element = (<_Element>text_or_uri_or_element).tag
            # the tag of an element is not necessarily a string
            if not _isString(text_or_uri_or_element):
                raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
        elif isinstance(text_or_uri_or_element, QName):
            text_or_uri_or_element = (<QName>text_or_uri_or_element).text
        elif text_or_uri_or_element is None:
            raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
        else:
            text_or_uri_or_element = unicode(text_or_uri_or_element)

        href_utf, name_utf = _getNsTag(text_or_uri_or_element)
        if tag is not None:
            # either ('ns', 'tag') or ('{ns}oldtag', 'newtag')
            if href_utf is None:
                href_utf = name_utf  # case 1: namespace ended up as tag name
            name_utf = _utf8(tag)
        _tagValidOrRaise(name_utf)

        self.localname = (<bytes>name_utf).decode('utf8')
        if href_utf is not None:
            self.namespace = (<bytes>href_utf).decode('utf8')
            self.text = u"{%s}%s" % (self.namespace, self.localname)
        else:
            self.namespace = None
            self.text = self.localname

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def __richcmp__(self, other, int op):
        # compare by qualified text; only the conversion of foreign
        # objects to a string can fail
        if type(other) is QName:
            other_text = (<QName>other).text
        elif isinstance(other, unicode):
            other_text = other
        else:
            try:
                other_text = unicode(other)
            except (ValueError, UnicodeDecodeError):
                return NotImplemented
        return python.PyObject_RichCompare(self.text, other_text, op)
def parse(self, source, _BaseParser parser=None, *, base_url=None):
    u"""parse(self, source, parser=None, base_url=None)

    Parses ``source``, replaces the content of this tree with the
    result and returns the new root.
    """
    cdef _Document doc = None
    try:
        doc = _parseDocument(source, parser, base_url)
    except _TargetParserResult as target_result:
        # raises a TypeError if we don't get an _Element
        self._context_node = target_result.result
    else:
        self._context_node = doc.getroot()

    # the document is only kept directly if there is no root to hold it
    if self._context_node is None:
        self._doc = doc
    else:
        self._doc = None
    return self._context_node
def __deepcopy__(self, memo):
    # Copies the root element if there is one, otherwise the whole
    # document.  An entirely empty tree is returned as is.
    cdef _Element root
    cdef _Document doc
    cdef xmlDoc* c_doc

    if self._context_node is not None:
        root = self._context_node.__copy__()
        assert root is not None
        _assertValidNode(root)
        _copyNonElementSiblings(self._context_node._c_node, root._c_node)
        return _elementTreeFactory(None, root)

    if self._doc is None:
        # so what ...
        return self

    _assertValidDoc(self._doc)
    c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1)
    if c_doc is NULL:
        raise MemoryError()
    doc = _documentFactory(c_doc, self._doc._parser)
    return _elementTreeFactory(doc, None)
"""
        if self._context_node is not None and \
               self._context_node._doc is not None:
            return self._context_node._doc._parser
        if self._doc is not None:
            return self._doc._parser
        return None

    def write(self, file, *, encoding=None, method="xml",
              bint pretty_print=False, xml_declaration=None, bint with_tail=True,
              standalone=None, doctype=None, compression=0,
              bint exclusive=False, inclusive_ns_prefixes=None,
              bint with_comments=True, bint strip_text=False,
              docstring=None):
        u"""write(self, file, encoding=None, method="xml",
                  pretty_print=False, xml_declaration=None, with_tail=True,
                  standalone=None, doctype=None, compression=0,
                  exclusive=False, inclusive_ns_prefixes=None,
                  with_comments=True, strip_text=False)

        Write the tree to a filename, file or file-like object.

        Defaults to ASCII encoding and writing a declaration as needed.

        The keyword argument 'method' selects the output method:
        'xml', 'html', 'text', 'c14n' or 'c14n2'.  Default is 'xml'.

        With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
        ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
        C14N, include comments, and list the inclusive prefixes respectively.

        With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
        ``strip_text`` options control the output of comments and text space
        according to C14N 2.0.

        Passing a boolean value to the ``standalone`` option will
        output an XML declaration with the corresponding
        ``standalone`` flag.

        The ``doctype`` option allows passing in a plain string that will
        be serialised before the XML tree.  Note that passing in non
        well-formed content here will make the XML output non well-formed.
        Also, an existing doctype in the document tree will not be removed
        when serialising an ElementTree instance.

        The ``compression`` option enables GZip compression level 1-9.
        ``None`` or a negative value is treated as 0.

        The ``inclusive_ns_prefixes`` should be a list of namespace strings
        (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
        during exclusive C14N serialisation.  This parameter is ignored if
        exclusive mode=False.

        If exclusive=True and no list is provided, a namespace will only be
        rendered if it is used by the immediate parent or one of its attributes
        and its prefix and values have not already been rendered by an ancestor
        of the namespace node's parent element.

        The ``docstring`` option is deprecated.  If it is given and no
        ``doctype`` is given, a DeprecationWarning is issued and its value
        is used as the ``doctype``.

        Raises ValueError if ``encoding`` is given or ``xml_declaration`` is
        true with a C14N method, or if ``with_comments`` is false with a
        non-C14N method.
        """
        cdef bint write_declaration
        cdef int is_standalone

        self._assertHasRoot()
        _assertValidNode(self._context_node)
        if compression is None or compression < 0:
            compression = 0

        # C14N serialisation
        if method in ('c14n', 'c14n2'):
            if encoding is not None:
                raise ValueError("Cannot specify encoding with C14N")
            if xml_declaration:
                raise ValueError("Cannot enable XML declaration in C14N")

            if method == 'c14n':
                _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
                                compression, inclusive_ns_prefixes)
            else:  # c14n2
                with _open_utf8_file(file, compression=compression) as f:
                    target = C14NWriterTarget(
                        f.write, with_comments=with_comments, strip_text=strip_text)
                    _tree_to_target(self, target)
            # the C14N branches are complete here; nothing below applies to them
            return

        if not with_comments:
            raise ValueError("Can only discard comments in C14N serialisation")
        # suppress decl. in default case (purely for ElementTree compatibility)
        if xml_declaration is not None:
            # explicit request wins; only the encoding default is filled in
            write_declaration = xml_declaration
            if encoding is None:
                encoding = 'ASCII'
            else:
                encoding = encoding.upper()
        elif encoding is None:
            encoding = 'ASCII'
            write_declaration = 0
        else:
            # no explicit request: declare only for encodings other than
            # the ASCII/UTF-8 spellings listed below
            encoding = encoding.upper()
            write_declaration = encoding not in (
                'US-ASCII', 'ASCII', 'UTF8', 'UTF-8')
        # any non-None 'standalone' value forces the declaration to be written
        if standalone is None:
            is_standalone = -1
        elif standalone:
            write_declaration = 1
            is_standalone = 1
        else:
            write_declaration = 1
            is_standalone = 0

        if docstring is not None and doctype is None:
            import warnings
            warnings.warn(
                "The 'docstring' option is deprecated. Use 'doctype' instead.",
                DeprecationWarning)
            doctype = docstring

        _tofilelike(file, self._context_node, encoding, doctype, method,
                    write_declaration, 1, pretty_print, with_tail,
                    is_standalone, compression)

    def getpath(self, _Element element not None):
        u"""getpath(self, element)

        Returns a structural, absolute XPath expression to find the element.

        For namespaced elements, the expression uses prefixes from the
        document, which therefore need to be provided in order to make any
        use of the expression in XPath.

        Also see the method getelementpath(self, element), which returns a
        self-contained ElementPath expression.

        Raises ValueError if this tree has neither a context node nor a
        document, or if the element belongs to a different document.
        """
        cdef _Document doc
        cdef _Element root
        cdef xmlDoc* c_doc
        _assertValidNode(element)
        if self._context_node is not None:
            root = self._context_node
            doc = root._doc
        elif self._doc is not None:
            doc = self._doc
            root = doc.getroot()
        else:
            raise ValueError, u"Element is not in this tree."
        _assertValidDoc(doc)
        _assertValidNode(root)
        if element._doc is not doc:
            raise ValueError, u"Element is not in this tree."

        # NOTE(review): presumably _fakeRootDoc() temporarily presents 'root'
        # as the document root so the path is computed relative to this
        # tree - confirm against its definition.
        c_doc = _fakeRootDoc(doc._c_doc, root._c_node)
        c_path = tree.xmlGetNodePath(element._c_node)
        # undo the fake document before the NULL check so that it is
        # released on the error path as well
        _destroyFakeDoc(doc._c_doc, c_doc)
        if c_path is NULL:
            raise MemoryError()
        path = funicode(c_path)
        tree.xmlFree(c_path)
        return path

    def getelementpath(self, _Element element not None):
        u"""getelementpath(self, element)

        Returns a structural, absolute ElementPath expression to find the
        element.  This path can be used in the .find() method to look up
        the element, provided that the elements along the path and their
        list of immediate children were not modified in between.

        ElementPath has the advantage over an XPath expression (as returned
        by the .getpath() method) that it does not require additional prefix
        declarations.  It is always self-contained.

        Returns '.' if the element is the root of this tree.

        Raises ValueError if the input is not an Element node or if it is
        not found below the root of this tree.
        """
        cdef _Element root
        cdef Py_ssize_t count
        _assertValidNode(element)
        if element._c_node.type != tree.XML_ELEMENT_NODE:
            raise ValueError, u"input is not an Element"
        if self._context_node is not None:
            root = self._context_node
        elif self._doc is not None:
            root = self._doc.getroot()
        else:
            raise ValueError, u"Element is not in this tree"
        _assertValidNode(root)
        if element._doc is not root._doc:
            raise ValueError, u"Element is not in this tree"

        # walk from the element up to the root, collecting one path step
        # per level (reversed at the end)
        path = []
        c_element = element._c_node
        while c_element is not root._c_node:
            c_name = c_element.name
            c_href = _getNs(c_element)
            tag = _namespacedNameFromNsName(c_href, c_name)
            if c_href is NULL:
                c_href = <const_xmlChar*>b''  # no namespace (NULL is wildcard)
            # use tag[N] if there are preceding siblings with the same tag
            count = 0
            c_node = c_element.prev
            while c_node is not NULL:
                if c_node.type == tree.XML_ELEMENT_NODE:
                    if _tagMatches(c_node, c_href, c_name):
                        count += 1
                c_node = c_node.prev
            if count:
                tag = f'{tag}[{count+1}]'
            else:
                # use tag[1] if there are following siblings with the same tag
                c_node = c_element.next
                while c_node is not NULL:
                    if c_node.type == tree.XML_ELEMENT_NODE:
                        if _tagMatches(c_node, c_href, c_name):
                            tag += '[1]'
                            break
                    c_node = c_node.next

            path.append(tag)
            c_element = c_element.parent
            # running off the top without meeting 'root' means the element
            # is not a descendant of this tree's root
            if c_element is NULL or c_element.type != tree.XML_ELEMENT_NODE:
                raise ValueError, u"Element is not in this tree."
        if not path:
            return '.'
        path.reverse()
        return '/'.join(path)

    def getiterator(self, tag=None, *tags):
        u"""getiterator(self, tag=None, *tags)

        Returns a sequence or iterator of all elements in document order
        (depth first pre-order), starting with the root element.

        Can be restricted to find only elements with specific tags,
        see `_Element.iter`.

        :deprecated: Note that this method is deprecated as of
          ElementTree 1.3 and lxml 2.0.  It returns an iterator in
          lxml, which diverges from the original ElementTree
          behaviour.  If you want an efficient iterator, use the
          ``tree.iter()`` method instead.  You should only use this
          method in new code if you require backwards compatibility
          with older versions of lxml or ElementTree.
        """
        root = self.getroot()
        if root is None:
            return ITER_EMPTY
        # fold the single 'tag' argument into the 'tags' tuple
        if tag is not None:
            tags += (tag,)
        return root.getiterator(*tags)

    def iter(self, tag=None, *tags):
        u"""iter(self, tag=None, *tags)

        Creates an iterator for the root element.  The iterator loops over
        all elements in this tree, in document order.  Note that siblings
        of the root element (comments or processing instructions) are not
        returned by the iterator.

        Can be restricted to find only elements with specific tags,
        see `_Element.iter`.
"""
        root = self.getroot()
        if root is None:
            return ITER_EMPTY
        # fold the single 'tag' argument into the 'tags' tuple
        if tag is not None:
            tags += (tag,)
        return root.iter(*tags)

    def find(self, path, namespaces=None):
        u"""find(self, path, namespaces=None)

        Finds the first toplevel element with given tag.  Same as
        ``tree.getroot().find(path)``.

        A string path that starts with "/" is prefixed with "." before it
        is passed on to the root element.

        The optional ``namespaces`` argument accepts a
        prefix-to-namespace mapping that allows the usage of XPath
        prefixes in the path expression.
        """
        self._assertHasRoot()
        root = self.getroot()
        if _isString(path):
            # turn an absolute path ("/a/b") into one relative to the root
            if path[:1] == "/":
                path = "." + path
        return root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        u"""findtext(self, path, default=None, namespaces=None)

        Finds the text for the first element matching the ElementPath
        expression.  Same as getroot().findtext(path)

        A string path that starts with "/" is prefixed with "." before it
        is passed on to the root element.

        The optional ``namespaces`` argument accepts a
        prefix-to-namespace mapping that allows the usage of XPath
        prefixes in the path expression.
        """
        self._assertHasRoot()
        root = self.getroot()
        if _isString(path):
            # turn an absolute path ("/a/b") into one relative to the root
            if path[:1] == "/":
                path = "." + path
        return root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        u"""findall(self, path, namespaces=None)

        Finds all elements matching the ElementPath expression.  Same as
        getroot().findall(path).

        A string path that starts with "/" is prefixed with "." before it
        is passed on to the root element.

        The optional ``namespaces`` argument accepts a
        prefix-to-namespace mapping that allows the usage of XPath
        prefixes in the path expression.
        """
        self._assertHasRoot()
        root = self.getroot()
        if _isString(path):
            # turn an absolute path ("/a/b") into one relative to the root
            if path[:1] == "/":
                path = "." + path
        return root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        u"""iterfind(self, path, namespaces=None)

        Iterates over all elements matching the ElementPath expression.
        Same as getroot().iterfind(path).

        A string path that starts with "/" is prefixed with "." before it
        is passed on to the root element.

        The optional ``namespaces`` argument accepts a
        prefix-to-namespace mapping that allows the usage of XPath
        prefixes in the path expression.
        """
        self._assertHasRoot()
        root = self.getroot()
        if _isString(path):
            # turn an absolute path ("/a/b") into one relative to the root
            if path[:1] == "/":
                path = "." + path
        return root.iterfind(path, namespaces)

    def xpath(self, _path, *, namespaces=None, extensions=None,
              smart_strings=True, **_variables):
        u"""xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)

        XPath evaluate in context of document.

        ``namespaces`` is an optional dictionary with prefix to namespace URI
        mappings, used by XPath.  ``extensions`` defines additional extension
        functions.

        Additional keyword arguments are passed to the evaluator call as
        XPath variables.

        Returns a list (nodeset), or bool, float or string.

        In case of a list result, return Element for element nodes,
        string for text and attribute values.

        Note: if you are going to apply multiple XPath expressions
        against the same document, it is more efficient to use
        XPathEvaluator directly.
        """
        self._assertHasRoot()
        # a new evaluator is built on every call, hence the note above
        evaluator = XPathDocumentEvaluator(self, namespaces=namespaces,
                                           extensions=extensions,
                                           smart_strings=smart_strings)
        return evaluator(_path, **_variables)

    def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
        u"""xslt(self, _xslt, extensions=None, access_control=None, **_kw)

        Transform this document using other document.

        xslt is a tree that should be XSLT
        keyword parameters are XSLT transformation parameters.

        Returns the transformed tree.

        Note: if you are going to apply the same XSLT stylesheet against
        multiple documents, it is more efficient to use the XSLT
        class directly.
"""
        self._assertHasRoot()
        # a new XSLT object is built on every call, hence the note above
        style = XSLT(_xslt, extensions=extensions,
                     access_control=access_control)
        return style(self, **_kw)

    def relaxng(self, relaxng):
        u"""relaxng(self, relaxng)

        Validate this document using other document.

        The relaxng argument is a tree that should contain a Relax NG schema.

        Returns True or False, depending on whether validation
        succeeded.

        Note: if you are going to apply the same Relax NG schema against
        multiple documents, it is more efficient to use the RelaxNG
        class directly.
        """
        self._assertHasRoot()
        # a new RelaxNG object is built on every call, hence the note above
        schema = RelaxNG(relaxng)
        return schema.validate(self)

    def xmlschema(self, xmlschema):
        u"""xmlschema(self, xmlschema)

        Validate this document using other document.

        The xmlschema argument is a tree that should contain an XML Schema.

        Returns True or False, depending on whether validation
        succeeded.

        Note: If you are going to apply the same XML Schema against
        multiple documents, it is more efficient to use the XMLSchema
        class directly.
        """
        self._assertHasRoot()
        # a new XMLSchema object is built on every call, hence the note above
        schema = XMLSchema(xmlschema)
        return schema.validate(self)

    def xinclude(self):
        u"""xinclude(self)

        Process the XInclude nodes in this document and include the
        referenced XML fragments.

        There is support for loading files through the file system, HTTP and
        FTP.

        Note that XInclude does not support custom resolvers in Python space
        due to restrictions of libxml2 <= 2.6.29.
        """
        self._assertHasRoot()
        # processing is applied to the context node of this tree
        XInclude()(self._context_node)

    def write_c14n(self, file, *, bint exclusive=False, bint with_comments=True,
                   compression=0, inclusive_ns_prefixes=None):
        u"""write_c14n(self, file, exclusive=False, with_comments=True,
                       compression=0, inclusive_ns_prefixes=None)

        C14N write of document.  Always writes UTF-8.

        The ``compression`` option enables GZip compression level 1-9.
        ``None`` or a negative value is treated as 0.

        The ``inclusive_ns_prefixes`` should be a list of namespace strings
        (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
        during exclusive C14N serialisation.  This parameter is ignored if
        exclusive mode=False.

        If exclusive=True and no list is provided, a namespace will only be
        rendered if it is used by the immediate parent or one of its attributes
        and its prefix and values have not already been rendered by an ancestor
        of the namespace node's parent element.

        NOTE: This method is deprecated as of lxml 4.4 and will be removed in a
        future release.  Use ``.write(f, method="c14n")`` instead.
        """
        self._assertHasRoot()
        _assertValidNode(self._context_node)
        if compression is None or compression < 0:
            compression = 0

        _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
                        compression, inclusive_ns_prefixes)

# Create a plain _ElementTree for the given document / context node.
cdef _ElementTree _elementTreeFactory(_Document doc, _Element context_node):
    return _newElementTree(doc, context_node, _ElementTree)

# Instantiate 'baseclass' and bind it to a context node, or to the bare
# document when no context node is available.
cdef _ElementTree _newElementTree(_Document doc, _Element context_node,
                                  object baseclass):
    cdef _ElementTree result
    result = baseclass()
    # without an explicit context node, fall back to the document root
    if context_node is None and doc is not None:
        context_node = doc.getroot()
    if context_node is None:
        # still no node: keep only the document reference
        _assertValidDoc(doc)
        result._doc = doc
    else:
        _assertValidNode(context_node)
    result._context_node = context_node
    return result


@cython.final
@cython.freelist(16)
cdef class _Attrib:
    u"""A dict-like proxy for the ``Element.attrib`` property.
"""
    cdef _Element _element
    def __cinit__(self, _Element element not None):
        _assertValidNode(element)
        self._element = element

    # MANIPULATORS
    def __setitem__(self, key, value):
        _assertValidNode(self._element)
        _setAttributeValue(self._element, key, value)

    def __delitem__(self, key):
        _assertValidNode(self._element)
        _delAttribute(self._element, key)

    def update(self, sequence_or_dict):
        """Set attributes from a dict, another _Attrib, or an iterable of
        (key, value) pairs.
        """
        _assertValidNode(self._element)
        if isinstance(sequence_or_dict, (dict, _Attrib)):
            sequence_or_dict = sequence_or_dict.items()
        for key, value in sequence_or_dict:
            _setAttributeValue(self._element, key, value)

    def pop(self, key, *default):
        """Remove the attribute and return its value.

        If the attribute is not set, return the single optional default
        argument, or raise KeyError if none was given.  Raises TypeError
        if more than one default is passed.
        """
        if len(default) > 1:
            raise TypeError, f"pop expected at most 2 arguments, got {len(default)+1}"
        _assertValidNode(self._element)
        result = _getAttributeValue(self._element, key, None)
        if result is None:
            if not default:
                raise KeyError, key
            result = default[0]
        else:
            _delAttribute(self._element, key)
        return result

    def clear(self):
        """Remove all attributes from the element."""
        _assertValidNode(self._element)
        c_attrs = self._element._c_node.properties
        if c_attrs:
            # detach the list from the node first, then free it
            self._element._c_node.properties = NULL
            tree.xmlFreePropList(c_attrs)

    # ACCESSORS
    def __repr__(self):
        _assertValidNode(self._element)
        return repr(dict( _collectAttributes(self._element._c_node, 3) ))

    def __copy__(self):
        # copying yields a plain dict, not another _Attrib proxy
        _assertValidNode(self._element)
        return dict(_collectAttributes(self._element._c_node, 3))

    def __deepcopy__(self, memo):
        # same result as __copy__(); 'memo' is not used
        _assertValidNode(self._element)
        return dict(_collectAttributes(self._element._c_node, 3))

    def __getitem__(self, key):
        _assertValidNode(self._element)
        result = _getAttributeValue(self._element, key, None)
        if result is None:
            raise KeyError, key
        return result

    def __bool__(self):
        _assertValidNode(self._element)
        cdef xmlAttr* c_attr = self._element._c_node.properties
        # true as soon as one entry of type XML_ATTRIBUTE_NODE is found
        while c_attr is not NULL:
            if c_attr.type == tree.XML_ATTRIBUTE_NODE:
                return 1
            c_attr = c_attr.next
        return 0

    def __len__(self):
        _assertValidNode(self._element)
        cdef xmlAttr* c_attr = self._element._c_node.properties
        cdef Py_ssize_t c = 0
        # count only entries of type XML_ATTRIBUTE_NODE
        while c_attr is not NULL:
            if c_attr.type == tree.XML_ATTRIBUTE_NODE:
                c += 1
            c_attr = c_attr.next
        return c

    def get(self, key, default=None):
        _assertValidNode(self._element)
        return _getAttributeValue(self._element, key, default)

    # NOTE(review): the second argument of _collectAttributes() presumably
    # follows the 1 = keys, 2 = values, 3 = items convention documented on
    # _AttribIterator._keysvalues - confirm against its definition.
    def keys(self):
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 1)

    def __iter__(self):
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 1))

    def iterkeys(self):
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 1))

    def values(self):
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 2)

    def itervalues(self):
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 2))

    def items(self):
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 3)

    def iteritems(self):
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 3))

    def has_key(self, key):
        _assertValidNode(self._element)
        return key in self

    def __contains__(self, key):
        _assertValidNode(self._element)
        cdef xmlNode* c_node
        ns, tag = _getNsTag(key)
        c_node = self._element._c_node
        # a key without namespace part is looked up with a NULL href
        c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
        return 1 if tree.xmlHasNsProp(c_node, _xcstr(tag), c_href) else 0

    def __richcmp__(self, other, int op):
        # compare as dicts; operands that cannot be converted to a dict
        # make the comparison NotImplemented
        try:
            one = dict(self.items())
            if not isinstance(other, dict):
                other = dict(other)
        except (TypeError, ValueError):
            return NotImplemented
        return python.PyObject_RichCompare(one, other, op)

MutableMapping.register(_Attrib)


@cython.final
@cython.internal
cdef class _AttribIterator:
    u"""Attribute iterator - for internal use only!
    """
    # XML attributes must not be removed while running!
    cdef _Element _node
    cdef xmlAttr* _c_attr
    cdef int _keysvalues # 1 - keys, 2 - values, 3 - items (key, value)
    def __iter__(self):
        return self

    def __next__(self):
        cdef xmlAttr* c_attr
        if self._node is None:
            raise StopIteration
        c_attr = self._c_attr
        # skip entries that are not of type XML_ATTRIBUTE_NODE
        while c_attr is not NULL and c_attr.type != tree.XML_ATTRIBUTE_NODE:
            c_attr = c_attr.next
        if c_attr is NULL:
            # exhausted: drop the element reference so that further calls
            # raise StopIteration right away
            self._node = None
            raise StopIteration

        # remember the continuation point before returning the current item
        self._c_attr = c_attr.next
        if self._keysvalues == 1:
            return _namespacedName(<xmlNode*>c_attr)
        elif self._keysvalues == 2:
            return _attributeValue(self._node._c_node, c_attr)
        else:
            return (_namespacedName(<xmlNode*>c_attr),
                    _attributeValue(self._node._c_node, c_attr))

# Return an _AttribIterator over the element's attributes, or ITER_EMPTY
# if the element has no properties at all.
cdef object _attributeIteratorFactory(_Element element, int keysvalues):
    cdef _AttribIterator attribs
    if element._c_node.properties is NULL:
        return ITER_EMPTY
    attribs = _AttribIterator()
    attribs._node = element
    attribs._c_attr = element._c_node.properties
    attribs._keysvalues = keysvalues
    return attribs


cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
                                       type LxmlElementTagMatcherType ]:
    """
    Dead but public. :)
    """
    cdef object _pystrings
    cdef int _node_type
    cdef char* _href
    cdef char* _name
    cdef _initTagMatch(self, tag):
        # Translate 'tag' into a node type plus optional namespace/name
        # C strings.  A node type of 0 stands for "no restriction".
        self._href = NULL
        self._name = NULL
        if tag is None:
            self._node_type = 0
        elif tag is Comment:
            self._node_type = tree.XML_COMMENT_NODE
        elif tag is ProcessingInstruction:
            self._node_type = tree.XML_PI_NODE
        elif tag is Entity:
            self._node_type = tree.XML_ENTITY_REF_NODE
        elif tag is Element:
            self._node_type = tree.XML_ELEMENT_NODE
        else:
            self._node_type = tree.XML_ELEMENT_NODE
            # keep the Python strings referenced from the instance while
            # _href/_name point to C strings obtained from them
            self._pystrings = _getNsTag(tag)
            if self._pystrings[0] is not None:
                self._href = _cstr(self._pystrings[0])
            self._name = _cstr(self._pystrings[1])
            # a name of exactly "*" is stored as NULL
            if self._name[0] == c'*' and self._name[1] == c'\0':
                self._name = NULL

cdef public class _ElementIterator(_ElementTagMatcher) [
    object LxmlElementIterator, type LxmlElementIteratorType ]:
    """
    Dead but public. :)
    """
    # we keep Python references here to control GC
    cdef _Element _node
    cdef _node_to_node_function _next_element
    def __iter__(self):
        return self

    cdef void _storeNext(self, _Element node):
        # Advance from 'node' to the next node that passes the type/tag
        # filter and store it in self._node (None when there is none).
        cdef xmlNode* c_node
        c_node = self._next_element(node._c_node)
        while c_node is not NULL and \
                  self._node_type != 0 and \
                  (<tree.xmlElementType>self._node_type != c_node.type or
                   not _tagMatches(c_node, <const_xmlChar*>self._href, <const_xmlChar*>self._name)):
            c_node = self._next_element(c_node)
        if c_node is NULL:
            self._node = None
        else:
            # Python ref:
            self._node = _elementFactory(node._doc, c_node)

    def __next__(self):
        cdef xmlNode* c_node
        cdef _Element current_node
        if self._node is None:
            raise StopIteration
        # Python ref:
        current_node = self._node
        # look ahead for the following node before returning the current one
        self._storeNext(current_node)
        return current_node

@cython.final
@cython.internal
cdef class _MultiTagMatcher:
    """
    Match an xmlNode against a list of tags.
    """
    cdef list _py_tags
    cdef qname* _cached_tags
    cdef size_t _tag_count
    cdef size_t _cached_size
    cdef _Document _cached_doc
    cdef int _node_types

    def __cinit__(self, tags):
        self._py_tags = []
        self.initTagMatch(tags)

    def __dealloc__(self):
        self._clear()

    cdef bint rejectsAll(self):
        # neither cached tag names nor node types selected
        return not self._tag_count and not self._node_types

    cdef bint rejectsAllAttributes(self):
        return not self._tag_count

    cdef bint matchesType(self, int node_type):
        # element nodes are candidates whenever tag names are cached;
        # otherwise the decision is made by the node type bit mask
        if node_type == tree.XML_ELEMENT_NODE and self._tag_count:
            return True
        return self._node_types & (1 << node_type)

    cdef void _clear(self):
        # Release the cached qname array: decref each cached href, then
        # free the array itself.
        cdef size_t i, count
        count = self._tag_count
        self._tag_count = 0
        if self._cached_tags:
            for i in xrange(count):
                cpython.ref.Py_XDECREF(self._cached_tags[i].href)
            python.lxml_free(self._cached_tags)
            self._cached_tags = NULL

    cdef initTagMatch(self, tags):
        # Reset all cached state and (re-)initialise the matcher from 'tags'.
        self._cached_doc = None
        del self._py_tags[:]
        self._clear()
        if tags is None or tags == ():
            # no selection in tags argument => match anything
            self._node_types = (
                1 << tree.XML_COMMENT_NODE |
                1 << tree.XML_PI_NODE |
                1 << tree.XML_ENTITY_REF_NODE |
                1 << tree.XML_ELEMENT_NODE)
        else:
            self._node_types = 0
            self._storeTags(tags, set())

    cdef _storeTags(self, tag, set seen):
        # Record one tag selector.  Factory functions set node type bits,
        # strings and QNames add (href, name) pairs, anything else is
        # iterated and handled recursively.  'seen' filters duplicate
        # tag strings.
        if tag is Comment:
            self._node_types |= 1 << tree.XML_COMMENT_NODE
        elif tag is ProcessingInstruction:
            self._node_types |= 1 << tree.XML_PI_NODE
        elif tag is Entity:
            self._node_types |= 1 << tree.XML_ENTITY_REF_NODE
        elif tag is Element:
            self._node_types |= 1 << tree.XML_ELEMENT_NODE
        elif python._isString(tag):
            if tag in seen:
                return
            seen.add(tag)
            if tag in ('*', '{*}*'):
                self._node_types |= 1 << tree.XML_ELEMENT_NODE
            else:
                href, name = _getNsTag(tag)
                if name == b'*':
                    name = None
                if href is None:
                    href = b''  # no namespace
                elif href == b'*':
                    href = None  # wildcard: any namespace, including none
                self._py_tags.append((href, name))
        elif isinstance(tag, QName):
            self._storeTags(tag.text, seen)
        else:
            # support a sequence of tags
            for item in tag:
                self._storeTags(item, seen)

    cdef inline int cacheTags(self, _Document doc, bint force_into_dict=False) except -1:
        """
        Look up the tag names in the doc dict to enable string pointer comparisons.
        """
        cdef size_t dict_size = tree.xmlDictSize(doc._c_doc.dict)
        if doc is self._cached_doc and dict_size == self._cached_size:
            # doc and dict didn't change => names already cached
            return 0
        self._tag_count = 0
        if not self._py_tags:
            # no tag names to look up, just remember the document state
            self._cached_doc = doc
            self._cached_size = dict_size
            return 0
        if not self._cached_tags:
            # allocate the qname array on first use, one slot per stored tag
            self._cached_tags = <qname*>python.lxml_malloc(len(self._py_tags), sizeof(qname))
            if not self._cached_tags:
                self._cached_doc = None
                raise MemoryError()
        self._tag_count = <size_t>_mapTagsToQnameMatchArray(
            doc._c_doc, self._py_tags, self._cached_tags, force_into_dict)
        self._cached_doc = doc
        self._cached_size = dict_size
        return 0

    cdef inline bint matches(self, xmlNode* c_node):
        # a selected node type matches directly; element nodes are
        # additionally compared against the cached tag names
        cdef qname* c_qname
        if self._node_types & (1 << c_node.type):
            return True
        elif c_node.type == tree.XML_ELEMENT_NODE:
            for c_qname in self._cached_tags[:self._tag_count]:
                if _tagMatchesExactly(c_node, c_qname):
                    return True
        return False

    cdef inline bint matchesNsTag(self, const_xmlChar* c_href,
                                  const_xmlChar* c_name):
        # like matches(), but for a namespace/name pair instead of a node
        cdef qname* c_qname
        if self._node_types & (1 << tree.XML_ELEMENT_NODE):
            return True
        for c_qname in self._cached_tags[:self._tag_count]:
            if _nsTagMatchesExactly(c_href, c_name, c_qname):
                return True
        return False

    cdef inline bint matchesAttribute(self, xmlAttr* c_attr):
        """Attribute matches differ from Element matches in that they do
        not care about node types.
        """
        cdef qname* c_qname
        for c_qname in self._cached_tags[:self._tag_count]:
            if _tagMatchesExactly(<xmlNode*>c_attr, c_qname):
                return True
        return False

cdef class _ElementMatchIterator:
    # Base class of the node iterators below: subclasses select the step
    # function (_next_element) and the start node in their __cinit__().
    cdef _Element _node
    cdef _node_to_node_function _next_element
    cdef _MultiTagMatcher _matcher

    @cython.final
    cdef _initTagMatcher(self, tags):
        self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tags)

    def __iter__(self):
        return self

    @cython.final
    cdef int _storeNext(self, _Element node) except -1:
        # Step from 'node' to the next node accepted by the matcher and
        # store it in self._node (None when there is none).
        self._matcher.cacheTags(node._doc)
        c_node = self._next_element(node._c_node)
        while c_node is not NULL and not self._matcher.matches(c_node):
            c_node = self._next_element(c_node)
        # store Python ref to next node to make sure it's kept alive
        self._node = _elementFactory(node._doc, c_node) if c_node is not NULL else None
        return 0

    def __next__(self):
        cdef _Element current_node = self._node
        if current_node is None:
            raise StopIteration
        # look ahead for the following node before returning the current one
        self._storeNext(current_node)
        return current_node

cdef class ElementChildIterator(_ElementMatchIterator):
    u"""ElementChildIterator(self, node, tag=None, reversed=False)
    Iterates over the children of an element.
"""
    def __cinit__(self, _Element node not None, tag=None, *, bint reversed=False):
        cdef xmlNode* c_node
        _assertValidNode(node)
        self._initTagMatcher(tag)
        # pick the start child and the step function for the direction
        if reversed:
            c_node = _findChildBackwards(node._c_node, 0)
            self._next_element = _previousElement
        else:
            c_node = _findChildForwards(node._c_node, 0)
            self._next_element = _nextElement
        self._matcher.cacheTags(node._doc)
        # the start child itself may not match, so skip ahead if needed
        while c_node is not NULL and not self._matcher.matches(c_node):
            c_node = self._next_element(c_node)
        # store Python ref to next node to make sure it's kept alive
        self._node = _elementFactory(node._doc, c_node) if c_node is not NULL else None

cdef class SiblingsIterator(_ElementMatchIterator):
    u"""SiblingsIterator(self, node, tag=None, preceding=False)
    Iterates over the siblings of an element.

    You can pass the boolean keyword ``preceding`` to specify the direction.
    """
    def __cinit__(self, _Element node not None, tag=None, *, bint preceding=False):
        _assertValidNode(node)
        self._initTagMatcher(tag)
        if preceding:
            self._next_element = _previousElement
        else:
            self._next_element = _nextElement
        # position on the first matching sibling after 'node'; the start
        # node itself is not part of the iteration
        self._storeNext(node)

cdef class AncestorsIterator(_ElementMatchIterator):
    u"""AncestorsIterator(self, node, tag=None)
    Iterates over the ancestors of an element (from parent to parent).
    """
    def __cinit__(self, _Element node not None, tag=None):
        _assertValidNode(node)
        self._initTagMatcher(tag)
        self._next_element = _parentElement
        # position on the first matching ancestor; the start node itself
        # is not part of the iteration
        self._storeNext(node)

cdef class ElementDepthFirstIterator:
    u"""ElementDepthFirstIterator(self, node, tag=None, inclusive=True)
    Iterates over an element and its sub-elements in document order (depth
    first pre-order).

    Note that this also includes comments, entities and processing
    instructions.  To filter them out, check if the ``tag`` property
    of the returned element is a string (i.e. not None and not a
    factory function), or pass the ``Element`` factory for the ``tag``
    argument to receive only Elements.

    If the optional ``tag`` argument is not None, the iterator returns only
    the elements that match the respective name and namespace.

    The optional boolean argument 'inclusive' defaults to True and can be set
    to False to exclude the start element itself.

    Note that the behaviour of this iterator is completely undefined if the
    tree it traverses is modified during iteration.
    """
    # we keep Python references here to control GC
    # keep the next Element after the one we return, and the (s)top node
    cdef _Element _next_node
    cdef _Element _top_node
    cdef _MultiTagMatcher _matcher
    def __cinit__(self, _Element node not None, tag=None, *, bint inclusive=True):
        _assertValidNode(node)
        self._top_node = node
        self._next_node = node
        self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
        self._matcher.cacheTags(node._doc)
        # consume the start node if it is excluded or does not match
        if not inclusive or not self._matcher.matches(node._c_node):
            # find start node (this cannot raise StopIteration, self._next_node != None)
            next(self)

    def __iter__(self):
        return self

    def __next__(self):
        cdef xmlNode* c_node
        cdef _Element current_node = self._next_node
        if current_node is None:
            raise StopIteration
        c_node = current_node._c_node
        self._matcher.cacheTags(current_node._doc)
        if not self._matcher._tag_count:
            # no tag name was found in the dict => not in document either
            # try to match by node type
            c_node = self._nextNodeAnyTag(c_node)
        else:
            c_node = self._nextNodeMatchTag(c_node)
        # look-ahead: store the following match, return the current one
        if c_node is NULL:
            self._next_node = None
        else:
            self._next_node = _elementFactory(current_node._doc, c_node)
        return current_node

    @cython.final
    cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node):
        # Return the next node after c_node, within the traversal that
        # starts at the top node, whose type is selected in the matcher's
        # node type mask, or NULL if there is none.
        cdef int node_types = self._matcher._node_types
        if not node_types:
            return NULL
        tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
        if node_types & (1 << c_node.type):
            return c_node
        tree.END_FOR_EACH_ELEMENT_FROM(c_node)
        return NULL

    @cython.final
    cdef xmlNode* _nextNodeMatchTag(self, xmlNode* c_node):
        # Same traversal as _nextNodeAnyTag(), but using the full matcher
        # test (node types and cached tag names).
        tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
        if self._matcher.matches(c_node):
            return c_node
        tree.END_FOR_EACH_ELEMENT_FROM(c_node)
        return NULL

cdef class ElementTextIterator:
    u"""ElementTextIterator(self, element, tag=None, with_tail=True)
    Iterates over the text content of a subtree.

    You can pass the ``tag`` keyword argument to restrict text content to a
    specific tag name.

    You can set the ``with_tail`` keyword argument to ``False`` to skip over
    tail text (e.g. if you know that it's only whitespace from pretty-printing).
"""
    cdef object _events
    cdef _Element _start_element
    def __cinit__(self, _Element element not None, tag=None, *, bint with_tail=True):
        _assertValidNode(element)
        # "end" events are only requested when tail text is wanted
        if with_tail:
            events = (u"start", u"comment", u"pi", u"end")
        else:
            events = (u"start", u"comment", u"pi")
        self._start_element = element
        self._events = iterwalk(element, events=events, tag=tag)

    def __iter__(self):
        return self

    def __next__(self):
        cdef _Element element
        result = None
        # keep pulling events until one of them provides a text value
        while result is None:
            event, element = next(self._events) # raises StopIteration
            if event == u"start":
                result = element.text
            elif element is not self._start_element:
                # any other event yields the tail text, except for the
                # start element itself
                result = element.tail
        return result

# Thin wrappers around the libxml2 node constructors.

# New node named 'name_utf' in c_doc, created without a namespace.
cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL:
    cdef xmlNode* c_node
    c_node = tree.xmlNewDocNode(c_doc, NULL, _xcstr(name_utf), NULL)
    return c_node

# New comment node with the given text in c_doc.
cdef xmlNode* _createComment(xmlDoc* c_doc, const_xmlChar* text):
    cdef xmlNode* c_node
    c_node = tree.xmlNewDocComment(c_doc, text)
    return c_node

# New processing instruction node with the given target and text in c_doc.
cdef xmlNode* _createPI(xmlDoc* c_doc, const_xmlChar* target, const_xmlChar* text):
    cdef xmlNode* c_node
    c_node = tree.xmlNewDocPI(c_doc, target, text)
    return c_node

# New reference node for the given name in c_doc.
cdef xmlNode* _createEntity(xmlDoc* c_doc, const_xmlChar* name):
    cdef xmlNode* c_node
    c_node = tree.xmlNewReference(c_doc, name)
    return c_node

# module-level API for ElementTree

def Element(_tag, attrib=None, nsmap=None, **_extra):
    u"""Element(_tag, attrib=None, nsmap=None, **_extra)

    Element factory.  This function returns an object implementing the
    Element interface.

    Also look at the `_Element.makeelement()` and
    `_BaseParser.makeelement()` methods, which provide a faster way to
    create an Element within a specific document or parser context.
3041 """ 3042 return _makeElement(_tag, NULL, None, None, None, None, 3043 attrib, nsmap, _extra) 3044 3045 3046 def Comment(text=None): 3047 u"""Comment(text=None) 3048 3049 Comment element factory. This factory function creates a special element that will 3050 be serialized as an XML comment. 3051 """ 3052 cdef _Document doc 3053 cdef xmlNode* c_node 3054 cdef xmlDoc* c_doc 3055 3056 if text is None: 3057 text = b'' 3058 else: 3059 text = _utf8(text) 3060 if b'--' in text or text.endswith(b'-'): 3061 raise ValueError("Comment may not contain '--' or end with '-'") 3062 3063 c_doc = _newXMLDoc() 3064 doc = _documentFactory(c_doc, None) 3065 c_node = _createComment(c_doc, _xcstr(text)) 3066 tree.xmlAddChild(<xmlNode*>c_doc, c_node) 3067 return _elementFactory(doc, c_node) 3068 3069 3070 def ProcessingInstruction(target, text=None): 3071 u"""ProcessingInstruction(target, text=None) 3072 3073 ProcessingInstruction element factory. This factory function creates a 3074 special element that will be serialized as an XML processing instruction. 3075 """ 3076 cdef _Document doc 3077 cdef xmlNode* c_node 3078 cdef xmlDoc* c_doc 3079 3080 target = _utf8(target) 3081 _tagValidOrRaise(target) 3082 if target.lower() == b'xml': 3083 raise ValueError, f"Invalid PI name '{target}'" 3084 3085 if text is None: 3086 text = b'' 3087 else: 3088 text = _utf8(text) 3089 if b'?>' in text: 3090 raise ValueError, "PI text must not contain '?>'" 3091 3092 c_doc = _newXMLDoc() 3093 doc = _documentFactory(c_doc, None) 3094 c_node = _createPI(c_doc, _xcstr(target), _xcstr(text)) 3095 tree.xmlAddChild(<xmlNode*>c_doc, c_node) 3096 return _elementFactory(doc, c_node) 3097 3098 PI = ProcessingInstruction 3099 3100 3101 cdef class CDATA: 3102 u"""CDATA(data) 3103 3104 CDATA factory. This factory creates an opaque data object that 3105 can be used to set Element text. 
The usual way to use it is:: 3106 3107 >>> el = Element('content') 3108 >>> el.text = CDATA('a string') 3109 3110 >>> print(el.text) 3111 a string 3112 >>> print(tostring(el, encoding="unicode")) 3113 <content><![CDATA[a string]]></content> 3114 """ 3115 cdef bytes _utf8_data 3116 def __cinit__(self, data): 3117 _utf8_data = _utf8(data) 3118 if b']]>' in _utf8_data: 3119 raise ValueError, "']]>' not allowed inside CDATA" 3120 self._utf8_data = _utf8_data 3121 3122 3123 def Entity(name): 3124 u"""Entity(name) 3125 3126 Entity factory. This factory function creates a special element 3127 that will be serialized as an XML entity reference or character 3128 reference. Note, however, that entities will not be automatically 3129 declared in the document. A document that uses entity references 3130 requires a DTD to define the entities. 3131 """ 3132 cdef _Document doc 3133 cdef xmlNode* c_node 3134 cdef xmlDoc* c_doc 3135 name_utf = _utf8(name) 3136 c_name = _xcstr(name_utf) 3137 if c_name[0] == c'#': 3138 if not _characterReferenceIsValid(c_name + 1): 3139 raise ValueError, f"Invalid character reference: '{name}'" 3140 elif not _xmlNameIsValid(c_name): 3141 raise ValueError, f"Invalid entity reference: '{name}'" 3142 c_doc = _newXMLDoc() 3143 doc = _documentFactory(c_doc, None) 3144 c_node = _createEntity(c_doc, c_name) 3145 tree.xmlAddChild(<xmlNode*>c_doc, c_node) 3146 return _elementFactory(doc, c_node) 3147 3148 3149 def SubElement(_Element _parent not None, _tag, 3150 attrib=None, nsmap=None, **_extra): 3151 u"""SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra) 3152 3153 Subelement factory. This function creates an element instance, and 3154 appends it to an existing element. 3155 """ 3156 return _makeSubElement(_parent, _tag, None, None, attrib, nsmap, _extra) 3157 3158 3159 def ElementTree(_Element element=None, *, file=None, _BaseParser parser=None): 3160 u"""ElementTree(element=None, file=None, parser=None) 3161 3162 ElementTree wrapper class. 
3163 """ 3164 cdef xmlNode* c_next 3165 cdef xmlNode* c_node 3166 cdef xmlNode* c_node_copy 3167 cdef xmlDoc* c_doc 3168 cdef _ElementTree etree 3169 cdef _Document doc 3170 3171 if element is not None: 3172 doc = element._doc 3173 elif file is not None: 3174 try: 3175 doc = _parseDocument(file, parser, None) 3176 except _TargetParserResult as result_container: 3177 return result_container.result 3178 else: 3179 c_doc = _newXMLDoc() 3180 doc = _documentFactory(c_doc, parser) 3181 3182 return _elementTreeFactory(doc, element) 3183 3184 3185 def HTML(text, _BaseParser parser=None, *, base_url=None): 3186 u"""HTML(text, parser=None, base_url=None) 3187 3188 Parses an HTML document from a string constant. Returns the root 3189 node (or the result returned by a parser target). This function 3190 can be used to embed "HTML literals" in Python code. 3191 3192 To override the parser with a different ``HTMLParser`` you can pass it to 3193 the ``parser`` keyword argument. 3194 3195 The ``base_url`` keyword argument allows to set the original base URL of 3196 the document to support relative Paths when looking up external entities 3197 (DTD, XInclude, ...). 3198 """ 3199 cdef _Document doc 3200 if parser is None: 3201 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() 3202 if not isinstance(parser, HTMLParser): 3203 parser = __DEFAULT_HTML_PARSER 3204 try: 3205 doc = _parseMemoryDocument(text, base_url, parser) 3206 return doc.getroot() 3207 except _TargetParserResult as result_container: 3208 return result_container.result 3209 3210 3211 def XML(text, _BaseParser parser=None, *, base_url=None): 3212 u"""XML(text, parser=None, base_url=None) 3213 3214 Parses an XML document or fragment from a string constant. 3215 Returns the root node (or the result returned by a parser target). 
3216 This function can be used to embed "XML literals" in Python code, 3217 like in 3218 3219 >>> root = XML("<root><test/></root>") 3220 >>> print(root.tag) 3221 root 3222 3223 To override the parser with a different ``XMLParser`` you can pass it to 3224 the ``parser`` keyword argument. 3225 3226 The ``base_url`` keyword argument allows to set the original base URL of 3227 the document to support relative Paths when looking up external entities 3228 (DTD, XInclude, ...). 3229 """ 3230 cdef _Document doc 3231 if parser is None: 3232 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() 3233 if not isinstance(parser, XMLParser): 3234 parser = __DEFAULT_XML_PARSER 3235 try: 3236 doc = _parseMemoryDocument(text, base_url, parser) 3237 return doc.getroot() 3238 except _TargetParserResult as result_container: 3239 return result_container.result 3240 3241 3242 def fromstring(text, _BaseParser parser=None, *, base_url=None): 3243 u"""fromstring(text, parser=None, base_url=None) 3244 3245 Parses an XML document or fragment from a string. Returns the 3246 root node (or the result returned by a parser target). 3247 3248 To override the default parser with a different parser you can pass it to 3249 the ``parser`` keyword argument. 3250 3251 The ``base_url`` keyword argument allows to set the original base URL of 3252 the document to support relative Paths when looking up external entities 3253 (DTD, XInclude, ...). 3254 """ 3255 cdef _Document doc 3256 try: 3257 doc = _parseMemoryDocument(text, base_url, parser) 3258 return doc.getroot() 3259 except _TargetParserResult as result_container: 3260 return result_container.result 3261 3262 3263 def fromstringlist(strings, _BaseParser parser=None): 3264 u"""fromstringlist(strings, parser=None) 3265 3266 Parses an XML document from a sequence of strings. Returns the 3267 root node (or the result returned by a parser target). 
3268 3269 To override the default parser with a different parser you can pass it to 3270 the ``parser`` keyword argument. 3271 """ 3272 cdef _Document doc 3273 if isinstance(strings, (bytes, unicode)): 3274 raise ValueError("passing a single string into fromstringlist() is not" 3275 " efficient, use fromstring() instead") 3276 if parser is None: 3277 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() 3278 feed = parser.feed 3279 for data in strings: 3280 feed(data) 3281 return parser.close() 3282 3283 3284 def iselement(element): 3285 u"""iselement(element) 3286 3287 Checks if an object appears to be a valid element object. 3288 """ 3289 return isinstance(element, _Element) and (<_Element>element)._c_node is not NULL 3290 3291 3292 def indent(tree, space=" ", *, Py_ssize_t level=0): 3293 """indent(tree, space=" ", level=0) 3294 3295 Indent an XML document by inserting newlines and indentation space 3296 after elements. 3297 3298 *tree* is the ElementTree or Element to modify. The (root) element 3299 itself will not be changed, but the tail text of all elements in its 3300 subtree will be adapted. 3301 3302 *space* is the whitespace to insert for each indentation level, two 3303 space characters by default. 3304 3305 *level* is the initial indentation level. Setting this to a higher 3306 value than 0 can be used for indenting subtrees that are more deeply 3307 nested inside of a document. 3308 """ 3309 root = _rootNodeOrRaise(tree) 3310 if level < 0: 3311 raise ValueError(f"Initial indentation level must be >= 0, got {level}") 3312 if _hasChild(root._c_node): 3313 space = _utf8(space) 3314 indent = b"\n" + level * space 3315 _indent_children(root._c_node, 1, space, [indent, indent + space]) 3316 3317 3318 cdef int _indent_children(xmlNode* c_node, Py_ssize_t level, bytes one_space, list indentations) except -1: 3319 # Reuse indentation strings for speed. 
3320 if len(indentations) <= level: 3321 indentations.append(indentations[-1] + one_space) 3322 3323 # Start a new indentation level for the first child. 3324 child_indentation = indentations[level] 3325 if not _hasNonWhitespaceText(c_node): 3326 _setNodeText(c_node, child_indentation) 3327 3328 # Recursively indent all children. 3329 cdef xmlNode* c_child = _findChildForwards(c_node, 0) 3330 while c_child is not NULL: 3331 if _hasChild(c_child): 3332 _indent_children(c_child, level+1, one_space, indentations) 3333 c_next_child = _nextElement(c_child) 3334 if not _hasNonWhitespaceTail(c_child): 3335 if c_next_child is NULL: 3336 # Dedent after the last child. 3337 child_indentation = indentations[level-1] 3338 _setTailText(c_child, child_indentation) 3339 c_child = c_next_child 3340 return 0 3341 3342 3343 def dump(_Element elem not None, *, bint pretty_print=True, with_tail=True): 3344 u"""dump(elem, pretty_print=True, with_tail=True) 3345 3346 Writes an element tree or element structure to sys.stdout. This function 3347 should be used for debugging only. 3348 """ 3349 xml = tostring(elem, pretty_print=pretty_print, with_tail=with_tail, 3350 encoding=None if python.IS_PYTHON2 else 'unicode') 3351 if not pretty_print: 3352 xml += '\n' 3353 sys.stdout.write(xml) 3354 3355 3356 def tostring(element_or_tree, *, encoding=None, method="xml", 3357 xml_declaration=None, bint pretty_print=False, bint with_tail=True, 3358 standalone=None, doctype=None, 3359 # method='c14n' 3360 bint exclusive=False, inclusive_ns_prefixes=None, 3361 # method='c14n2' 3362 bint with_comments=True, bint strip_text=False, 3363 ): 3364 u"""tostring(element_or_tree, encoding=None, method="xml", 3365 xml_declaration=None, pretty_print=False, with_tail=True, 3366 standalone=None, doctype=None, 3367 exclusive=False, inclusive_ns_prefixes=None, 3368 with_comments=True, strip_text=False, 3369 ) 3370 3371 Serialize an element to an encoded string representation of its XML 3372 tree. 
3373 3374 Defaults to ASCII encoding without XML declaration. This 3375 behaviour can be configured with the keyword arguments 'encoding' 3376 (string) and 'xml_declaration' (bool). Note that changing the 3377 encoding to a non UTF-8 compatible encoding will enable a 3378 declaration by default. 3379 3380 You can also serialise to a Unicode string without declaration by 3381 passing the name ``'unicode'`` as encoding (or the ``str`` function 3382 in Py3 or ``unicode`` in Py2). This changes the return value from 3383 a byte string to an unencoded unicode string. 3384 3385 The keyword argument 'pretty_print' (bool) enables formatted XML. 3386 3387 The keyword argument 'method' selects the output method: 'xml', 3388 'html', plain 'text' (text content without tags), 'c14n' or 'c14n2'. 3389 Default is 'xml'. 3390 3391 With ``method="c14n"`` (C14N version 1), the options ``exclusive``, 3392 ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive 3393 C14N, include comments, and list the inclusive prefixes respectively. 3394 3395 With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and 3396 ``strip_text`` options control the output of comments and text space 3397 according to C14N 2.0. 3398 3399 Passing a boolean value to the ``standalone`` option will output 3400 an XML declaration with the corresponding ``standalone`` flag. 3401 3402 The ``doctype`` option allows passing in a plain string that will 3403 be serialised before the XML tree. Note that passing in non 3404 well-formed content here will make the XML output non well-formed. 3405 Also, an existing doctype in the document tree will not be removed 3406 when serialising an ElementTree instance. 3407 3408 You can prevent the tail text of the element from being serialised 3409 by passing the boolean ``with_tail`` option. This has no impact 3410 on the tail text of children, which will always be serialised. 
3411 """ 3412 cdef bint write_declaration 3413 cdef int is_standalone 3414 # C14N serialisation 3415 if method in ('c14n', 'c14n2'): 3416 if encoding is not None: 3417 raise ValueError("Cannot specify encoding with C14N") 3418 if xml_declaration: 3419 raise ValueError("Cannot enable XML declaration in C14N") 3420 if method == 'c14n': 3421 return _tostringC14N(element_or_tree, exclusive, with_comments, inclusive_ns_prefixes) 3422 else: 3423 out = BytesIO() 3424 target = C14NWriterTarget( 3425 utf8_writer(out).write, 3426 with_comments=with_comments, strip_text=strip_text) 3427 _tree_to_target(element_or_tree, target) 3428 return out.getvalue() 3429 if not with_comments: 3430 raise ValueError("Can only discard comments in C14N serialisation") 3431 if strip_text: 3432 raise ValueError("Can only strip text in C14N 2.0 serialisation") 3433 if encoding is unicode or (encoding is not None and encoding.lower() == 'unicode'): 3434 if xml_declaration: 3435 raise ValueError, \ 3436 u"Serialisation to unicode must not request an XML declaration" 3437 write_declaration = 0 3438 encoding = unicode 3439 elif xml_declaration is None: 3440 # by default, write an XML declaration only for non-standard encodings 3441 write_declaration = encoding is not None and encoding.upper() not in \ 3442 (u'ASCII', u'UTF-8', u'UTF8', u'US-ASCII') 3443 else: 3444 write_declaration = xml_declaration 3445 if encoding is None: 3446 encoding = u'ASCII' 3447 if standalone is None: 3448 is_standalone = -1 3449 elif standalone: 3450 write_declaration = 1 3451 is_standalone = 1 3452 else: 3453 write_declaration = 1 3454 is_standalone = 0 3455 3456 if isinstance(element_or_tree, _Element): 3457 return _tostring(<_Element>element_or_tree, encoding, doctype, method, 3458 write_declaration, 0, pretty_print, with_tail, 3459 is_standalone) 3460 elif isinstance(element_or_tree, _ElementTree): 3461 return _tostring((<_ElementTree>element_or_tree)._context_node, 3462 encoding, doctype, method, write_declaration, 1, 
3463 pretty_print, with_tail, is_standalone) 3464 else: 3465 raise TypeError, f"Type '{python._fqtypename(element_or_tree).decode('utf8')}' cannot be serialized." 3466 3467 3468 3469 def tostringlist(element_or_tree, *args, **kwargs): 3470 u"""tostringlist(element_or_tree, *args, **kwargs) 3471 3472 Serialize an element to an encoded string representation of its XML 3473 tree, stored in a list of partial strings. 3474 3475 This is purely for ElementTree 1.3 compatibility. The result is a 3476 single string wrapped in a list. 3477 """ 3478 return [tostring(element_or_tree, *args, **kwargs)] 3479 3480 3481 def tounicode(element_or_tree, *, method=u"xml", bint pretty_print=False, 3482 bint with_tail=True, doctype=None): 3483 u"""tounicode(element_or_tree, method="xml", pretty_print=False, 3484 with_tail=True, doctype=None) 3485 3486 Serialize an element to the Python unicode representation of its XML 3487 tree. 3488 3489 :deprecated: use ``tostring(el, encoding='unicode')`` instead. 3490 3491 Note that the result does not carry an XML encoding declaration and is 3492 therefore not necessarily suited for serialization to byte streams without 3493 further treatment. 3494 3495 The boolean keyword argument 'pretty_print' enables formatted XML. 3496 3497 The keyword argument 'method' selects the output method: 'xml', 3498 'html' or plain 'text'. 3499 3500 You can prevent the tail text of the element from being serialised 3501 by passing the boolean ``with_tail`` option. This has no impact 3502 on the tail text of children, which will always be serialised. 
3503 """ 3504 if isinstance(element_or_tree, _Element): 3505 return _tostring(<_Element>element_or_tree, unicode, doctype, method, 3506 0, 0, pretty_print, with_tail, -1) 3507 elif isinstance(element_or_tree, _ElementTree): 3508 return _tostring((<_ElementTree>element_or_tree)._context_node, 3509 unicode, doctype, method, 0, 1, pretty_print, 3510 with_tail, -1) 3511 else: 3512 raise TypeError, f"Type '{type(element_or_tree)}' cannot be serialized." 3513 3514 3515 def parse(source, _BaseParser parser=None, *, base_url=None): 3516 u"""parse(source, parser=None, base_url=None) 3517 3518 Return an ElementTree object loaded with source elements. If no parser 3519 is provided as second argument, the default parser is used. 3520 3521 The ``source`` can be any of the following: 3522 3523 - a file name/path 3524 - a file object 3525 - a file-like object 3526 - a URL using the HTTP or FTP protocol 3527 3528 To parse from a string, use the ``fromstring()`` function instead. 3529 3530 Note that it is generally faster to parse from a file path or URL 3531 than from an open file object or file-like object. Transparent 3532 decompression from gzip compressed sources is supported (unless 3533 explicitly disabled in libxml2). 3534 3535 The ``base_url`` keyword allows setting a URL for the document 3536 when parsing from a file-like object. This is needed when looking 3537 up external entities (DTD, XInclude, ...) with relative paths. 3538 """ 3539 cdef _Document doc 3540 try: 3541 doc = _parseDocument(source, parser, base_url) 3542 return _elementTreeFactory(doc, None) 3543 except _TargetParserResult as result_container: 3544 return result_container.result 3545 3546 3547 def adopt_external_document(capsule, _BaseParser parser=None): 3548 """adopt_external_document(capsule, parser=None) 3549 3550 Unpack a libxml2 document pointer from a PyCapsule and wrap it in an 3551 lxml ElementTree object. 
3552 3553 This allows external libraries to build XML/HTML trees using libxml2 3554 and then pass them efficiently into lxml for further processing. 3555 3556 If a ``parser`` is provided, it will be used for configuring the 3557 lxml document. No parsing will be done. 3558 3559 The capsule must have the name ``"libxml2:xmlDoc"`` and its pointer 3560 value must reference a correct libxml2 document of type ``xmlDoc*``. 3561 The creator of the capsule must take care to correctly clean up the 3562 document using an appropriate capsule destructor. By default, the 3563 libxml2 document will be copied to let lxml safely own the memory 3564 of the internal tree that it uses. 3565 3566 If the capsule context is non-NULL, it must point to a C string that 3567 can be compared using ``strcmp()``. If the context string equals 3568 ``"destructor:xmlFreeDoc"``, the libxml2 document will not be copied 3569 but the capsule invalidated instead by clearing its destructor and 3570 name. That way, lxml takes ownership of the libxml2 document in memory 3571 without creating a copy first, and the capsule destructor will not be 3572 called. The document will then eventually be cleaned up by lxml using 3573 the libxml2 API function ``xmlFreeDoc()`` once it is no longer used. 3574 3575 If no copy is made, later modifications of the tree outside of lxml 3576 should not be attempted after transferring the ownership. 
3577 """ 3578 cdef xmlDoc* c_doc 3579 cdef bint is_owned = False 3580 c_doc = <xmlDoc*> python.lxml_unpack_xmldoc_capsule(capsule, &is_owned) 3581 doc = _adoptForeignDoc(c_doc, parser, is_owned) 3582 return _elementTreeFactory(doc, None) 3583 3584 3585 ################################################################################ 3586 # Include submodules 3587 3588 include "readonlytree.pxi" # Read-only implementation of Element proxies 3589 include "classlookup.pxi" # Element class lookup mechanisms 3590 include "nsclasses.pxi" # Namespace implementation and registry 3591 include "docloader.pxi" # Support for custom document loaders 3592 include "parser.pxi" # XML and HTML parsers 3593 include "saxparser.pxi" # SAX-like Parser interface and tree builder 3594 include "parsertarget.pxi" # ET Parser target 3595 include "serializer.pxi" # XML output functions 3596 include "iterparse.pxi" # incremental XML parsing 3597 include "xmlid.pxi" # XMLID and IDDict 3598 include "xinclude.pxi" # XInclude 3599 include "cleanup.pxi" # Cleanup and recursive element removal functions 3600 3601 3602 ################################################################################ 3603 # Include submodules for XPath and XSLT 3604 3605 include "extensions.pxi" # XPath/XSLT extension functions 3606 include "xpath.pxi" # XPath evaluation 3607 include "xslt.pxi" # XSL transformations 3608 include "xsltext.pxi" # XSL extension elements 3609 3610 3611 ################################################################################ 3612 # Validation 3613 3614 cdef class DocumentInvalid(LxmlError): 3615 """Validation error. 3616 3617 Raised by all document validators when their ``assertValid(tree)`` 3618 method fails. 3619 """ 3620 3621 3622 cdef class _Validator: 3623 u"Base class for XML validators." 
3624 cdef _ErrorLog _error_log 3625 def __cinit__(self): 3626 self._error_log = _ErrorLog() 3627 3628 def validate(self, etree): 3629 u"""validate(self, etree) 3630 3631 Validate the document using this schema. 3632 3633 Returns true if document is valid, false if not. 3634 """ 3635 return self(etree) 3636 3637 def assertValid(self, etree): 3638 u"""assertValid(self, etree) 3639 3640 Raises `DocumentInvalid` if the document does not comply with the schema. 3641 """ 3642 if not self(etree): 3643 raise DocumentInvalid(self._error_log._buildExceptionMessage( 3644 u"Document does not comply with schema"), 3645 self._error_log) 3646 3647 def assert_(self, etree): 3648 u"""assert_(self, etree) 3649 3650 Raises `AssertionError` if the document does not comply with the schema. 3651 """ 3652 if not self(etree): 3653 raise AssertionError, self._error_log._buildExceptionMessage( 3654 u"Document does not comply with schema") 3655 3656 cpdef _append_log_message(self, int domain, int type, int level, int line, 3657 message, filename): 3658 self._error_log._receiveGeneric(domain, type, level, line, message, 3659 filename) 3660 3661 cpdef _clear_error_log(self): 3662 self._error_log.clear() 3663 3664 @property 3665 def error_log(self): 3666 """The log of validation errors and warnings.""" 3667 assert self._error_log is not None, "XPath evaluator not initialised" 3668 return self._error_log.copy() 3669 3670 include "dtd.pxi" # DTD 3671 include "relaxng.pxi" # RelaxNG 3672 include "xmlschema.pxi" # XMLSchema 3673 include "schematron.pxi" # Schematron (requires libxml2 2.6.21+) 3674 3675 ################################################################################ 3676 # Public C API 3677 3678 include "public-api.pxi" 3679 3680 ################################################################################ 3681 # Other stuff 3682 3683 include "debug.pxi"