apihelpers.pxi
# Private/public helper functions for API functions

from lxml.includes cimport uri


cdef void displayNode(xmlNode* c_node, indent):
    # to help with debugging: prints the address of c_node and, recursively,
    # of all its children, indented by nesting depth
    cdef xmlNode* c_child
    try:
        print indent * u' ', <long>c_node
        c_child = c_node.children
        while c_child is not NULL:
            displayNode(c_child, indent + 1)
            c_child = c_child.next
    finally:
        return  # swallow any exceptions

cdef inline int _assertValidNode(_Element element) except -1:
    # fails if the proxy no longer references a C node
    assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element)

cdef inline int _assertValidDoc(_Document doc) except -1:
    # fails if the proxy no longer references a C document
    assert doc._c_doc is not NULL, u"invalid Document proxy at %s" % id(doc)

cdef _Document _documentOrRaise(object input):
    u"""Call this to get the document of a _Document, _ElementTree or _Element
    object, or to raise an exception if it can't be determined.

    Should be used in all API functions for consistency.

    Raises TypeError for any other input type and ValueError if the input
    object has no document.
    """
    cdef _Document doc
    if isinstance(input, _ElementTree):
        if (<_ElementTree>input)._context_node is not None:
            doc = (<_ElementTree>input)._context_node._doc
        else:
            doc = None
    elif isinstance(input, _Element):
        doc = (<_Element>input)._doc
    elif isinstance(input, _Document):
        doc = <_Document>input
    else:
        raise TypeError, f"Invalid input object: {python._fqtypename(input).decode('utf8')}"
    if doc is None:
        raise ValueError, f"Input object has no document: {python._fqtypename(input).decode('utf8')}"
    _assertValidDoc(doc)
    return doc

cdef _Element _rootNodeOrRaise(object input):
    u"""Call this to get the root node of a _Document, _ElementTree or
    _Element object, or to raise an exception if it can't be determined.

    Should be used in all API functions for consistency.

    Raises TypeError for any other input type and ValueError if the
    resulting node is missing or is not an XML element node.
    """
    cdef _Element node
    if isinstance(input, _ElementTree):
        node = (<_ElementTree>input)._context_node
    elif isinstance(input, _Element):
        node = <_Element>input
    elif isinstance(input, _Document):
        node = (<_Document>input).getroot()
    else:
        raise TypeError, f"Invalid input object: {python._fqtypename(input).decode('utf8')}"
    if (node is None or not node._c_node or
            node._c_node.type != tree.XML_ELEMENT_NODE):
        raise ValueError, f"Input object is not an XML element: {python._fqtypename(input).decode('utf8')}"
    _assertValidNode(node)
    return node

cdef bint _isAncestorOrSame(xmlNode* c_ancestor, xmlNode* c_node):
    # walk up the parent chain of c_node until c_ancestor is found (or not)
    while c_node:
        if c_node is c_ancestor:
            return True
        c_node = c_node.parent
    return False

cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc,
                           _BaseParser parser, text, tail, attrib, nsmap,
                           dict extra_attrs):
    u"""Create a new element and initialize text content, namespaces and
    attributes.

    This helper function will reuse as much of the existing document as
    possible:

    If 'parser' is None, the parser will be inherited from 'doc' or the
    default parser will be used.

    If 'doc' is None, 'c_doc' is used to create a new _Document and the new
    element is made its root node.

    If 'c_doc' is also NULL, a new xmlDoc will be created.
    """
    cdef xmlNode* c_node
    if doc is not None:
        c_doc = doc._c_doc
    ns_utf, name_utf = _getNsTag(tag)
    if parser is not None and parser._for_html:
        _htmlTagValidOrRaise(name_utf)
        if c_doc is NULL:
            c_doc = _newHTMLDoc()
    else:
        _tagValidOrRaise(name_utf)
        if c_doc is NULL:
            c_doc = _newXMLDoc()
    c_node = _createElement(c_doc, name_utf)
    if c_node is NULL:
        if doc is None and c_doc is not NULL:
            tree.xmlFreeDoc(c_doc)
        raise MemoryError()
    try:
        if doc is None:
            tree.xmlDocSetRootElement(c_doc, c_node)
            doc = _documentFactory(c_doc, parser)
        if text is not None:
            _setNodeText(c_node, text)
        if tail is not None:
            _setTailText(c_node, tail)
        # add namespaces to node if necessary
        _setNodeNamespaces(c_node, doc, ns_utf, nsmap)
        _initNodeAttributes(c_node, doc, attrib, extra_attrs)
        return _elementFactory(doc, c_node)
    except:
        # free allocated c_node/c_doc unless Python does it for us
        if c_node.doc is not c_doc:
            # node not yet in document => will not be freed by document
            if tail is not None:
                _removeText(c_node.next)  # tail
            tree.xmlFreeNode(c_node)
        if doc is None:
            # c_doc will not be freed by doc
            tree.xmlFreeDoc(c_doc)
        raise

cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf,
                         _BaseParser parser, attrib, nsmap, dict extra_attrs) except -1:
    u"""Initialise a new Element object.

    This is used when users instantiate a Python Element subclass
    directly, without it being mapped to an existing XML node.

    A new (HTML or XML) document is created with the new node as its root,
    and 'element' is registered as the proxy of that node.
    """
    cdef xmlDoc* c_doc
    cdef xmlNode* c_node
    cdef _Document doc
    if is_html:
        _htmlTagValidOrRaise(name_utf)
        c_doc = _newHTMLDoc()
    else:
        _tagValidOrRaise(name_utf)
        c_doc = _newXMLDoc()
    c_node = _createElement(c_doc, name_utf)
    if c_node is NULL:
        if c_doc is not NULL:
            tree.xmlFreeDoc(c_doc)
        raise MemoryError()
    tree.xmlDocSetRootElement(c_doc, c_node)
    doc = _documentFactory(c_doc, parser)
    # add namespaces to node if necessary
    _setNodeNamespaces(c_node, doc, ns_utf, nsmap)
    _initNodeAttributes(c_node, doc, attrib, extra_attrs)
    _registerProxy(element, doc, c_node)
    element._init()
    return 0

cdef _Element _makeSubElement(_Element parent, tag, text, tail,
                              attrib, nsmap, dict extra_attrs):
    u"""Create a new child element and initialize text content, namespaces and
    attributes.

    Returns None if 'parent' is None or has no document.  On any error
    during initialisation, the new node is removed from the parent again
    before the exception is re-raised.
    """
    cdef xmlNode* c_node
    cdef xmlDoc* c_doc
    if parent is None or parent._doc is None:
        return None
    _assertValidNode(parent)
    ns_utf, name_utf = _getNsTag(tag)
    c_doc = parent._doc._c_doc

    if parent._doc._parser is not None and parent._doc._parser._for_html:
        _htmlTagValidOrRaise(name_utf)
    else:
        _tagValidOrRaise(name_utf)

    c_node = _createElement(c_doc, name_utf)
    if c_node is NULL:
        raise MemoryError()
    tree.xmlAddChild(parent._c_node, c_node)

    try:
        if text is not None:
            _setNodeText(c_node, text)
        if tail is not None:
            _setTailText(c_node, tail)

        # add namespaces to node if necessary
        _setNodeNamespaces(c_node, parent._doc, ns_utf, nsmap)
        _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs)
        return _elementFactory(parent._doc, c_node)
    except:
        # make sure we clean up in case of an error
        _removeNode(parent._doc, c_node)
        raise


cdef int _setNodeNamespaces(xmlNode* c_node, _Document doc,
                            object node_ns_utf, object nsmap) except -1:
    u"""Lookup current namespace prefixes, then set namespace structure for
    node (if 'node_ns_utf' was provided) and register new ns-prefix mappings.

    'node_ns_utf' should only be passed for a newly created node.
    """
    cdef xmlNs* c_ns

    if nsmap:
        for prefix, href in _iter_nsmap(nsmap):
            href_utf = _utf8(href)
            _uriValidOrRaise(href_utf)
            c_href = _xcstr(href_utf)
            if prefix is not None:
                prefix_utf = _utf8(prefix)
                _prefixValidOrRaise(prefix_utf)
                c_prefix = _xcstr(prefix_utf)
            else:
                c_prefix = <const_xmlChar*>NULL
            # add namespace with prefix if it is not already known
            c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix)
            if c_ns is NULL or \
                    c_ns.href is NULL or \
                    tree.xmlStrcmp(c_ns.href, c_href) != 0:
                c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
            if href_utf == node_ns_utf:
                # this mapping declares the node's own namespace => use it
                tree.xmlSetNs(c_node, c_ns)
                node_ns_utf = None

    if node_ns_utf is not None:
        # the node namespace was not covered by nsmap => let the document
        # set it up
        _uriValidOrRaise(node_ns_utf)
        doc._setNodeNs(c_node, _xcstr(node_ns_utf))
    return 0


cdef dict _build_nsmap(xmlNode* c_node):
    """
    Namespace prefix->URI mapping known in the context of this Element.
    This includes all namespace declarations of the parents.

    Declarations closer to the node take precedence over those of its
    ancestors.
    """
    cdef xmlNs* c_ns
    nsmap = {}
    while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE:
        c_ns = c_node.nsDef
        while c_ns is not NULL:
            if c_ns.prefix or c_ns.href:
                prefix = funicodeOrNone(c_ns.prefix)
                if prefix not in nsmap:
                    nsmap[prefix] = funicodeOrNone(c_ns.href)
            c_ns = c_ns.next
        c_node = c_node.parent
    return nsmap


cdef _iter_nsmap(nsmap):
    """
    Create a reproducibly ordered iterable from an nsmap mapping.
    Tries to preserve an existing order and sorts if it assumes no order.

    The difference to _iter_attrib() is that None doesn't sort with strings
    in Py3.x.
    """
    if python.PY_VERSION_HEX >= 0x03060000:
        # dicts are insertion-ordered in Py3.6+ => keep the user provided order.
        if isinstance(nsmap, dict):
            return nsmap.items()
    if len(nsmap) <= 1:
        return nsmap.items()
    # nsmap will usually be a plain unordered dict => avoid type checking overhead
    if type(nsmap) is not dict and isinstance(nsmap, OrderedDict):
        return nsmap.items()  # keep existing order
    if None not in nsmap:
        return sorted(nsmap.items())

    # Move the default namespace to the end.  This makes sure libxml2
    # prefers a prefix if the ns is defined redundantly on the same
    # element.  That way, users can work around a problem themselves
    # where default namespace attributes on non-default namespaced
    # elements serialise without prefix (i.e. into the non-default
    # namespace).
    default_ns = nsmap[None]
    nsdefs = [(k, v) for k, v in nsmap.items() if k is not None]
    nsdefs.sort()
    nsdefs.append((None, default_ns))
    return nsdefs


cdef _iter_attrib(attrib):
    """
    Create a reproducibly ordered iterable from an attrib mapping.
    Tries to preserve an existing order and sorts if it assumes no order.
    """
    # dicts are insertion-ordered in Py3.6+ => keep the user provided order.
    if python.PY_VERSION_HEX >= 0x03060000 and isinstance(attrib, dict) or (
            isinstance(attrib, (_Attrib, OrderedDict))):
        return attrib.items()
    # assume it's an unordered mapping of some kind
    return sorted(attrib.items())


cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, dict extra):
    u"""Initialise the attributes of an element node.

    Attributes from 'extra' are added before those from 'attrib'.  A name
    that was already added is not added again, so 'extra' takes precedence.

    Raises TypeError if 'attrib' is not None and has no 'items' attribute.
    """
    cdef bint is_html
    if attrib is not None and not hasattr(attrib, u'items'):
        raise TypeError, f"Invalid attribute dictionary: {python._fqtypename(attrib).decode('utf8')}"
    if not attrib and not extra:
        return  # nothing to do
    is_html = doc._parser._for_html
    seen = set()
    if extra:
        if python.PY_VERSION_HEX >= 0x03060000:
            for name, value in extra.items():
                _addAttributeToNode(c_node, doc, is_html, name, value, seen)
        else:
            for name, value in sorted(extra.items()):
                _addAttributeToNode(c_node, doc, is_html, name, value, seen)
    if attrib:
        for name, value in _iter_attrib(attrib):
            _addAttributeToNode(c_node, doc, is_html, name, value, seen)


cdef int _addAttributeToNode(xmlNode* c_node, _Document doc, bint is_html,
                             name, value, set seen_tags) except -1:
    # Add a single attribute to c_node unless its (namespace, name) pair is
    # already in 'seen_tags'.  Attribute names are not validated for HTML.
    ns_utf, name_utf = tag = _getNsTag(name)
    if tag in seen_tags:
        return 0
    seen_tags.add(tag)
    if not is_html:
        _attributeValidOrRaise(name_utf)
    value_utf = _utf8(value)
    if ns_utf is None:
        tree.xmlNewProp(c_node, _xcstr(name_utf), _xcstr(value_utf))
    else:
        _uriValidOrRaise(ns_utf)
        c_ns = doc._findOrBuildNodeNs(c_node, _xcstr(ns_utf), NULL, 1)
        tree.xmlNewNsProp(c_node, c_ns,
                          _xcstr(name_utf), _xcstr(value_utf))
    return 0


# a namespace declaration together with the node that carries it
ctypedef struct _ns_node_ref:
    xmlNs* ns
    xmlNode* node


cdef int _collectNsDefs(xmlNode* c_element, _ns_node_ref **_c_ns_list,
                        size_t *_c_ns_list_len, size_t *_c_ns_list_size) except -1:
    # Append all namespace declarations of c_element to the growable array
    # that is passed in (and handed back) through the three pointer
    # arguments.  On allocation failure, the array is freed, the caller's
    # array pointer is reset to NULL and MemoryError is raised.
    c_ns_list = _c_ns_list[0]
    cdef size_t c_ns_list_len = _c_ns_list_len[0]
    cdef size_t c_ns_list_size = _c_ns_list_size[0]

    c_nsdef = c_element.nsDef
    while c_nsdef is not NULL:
        if c_ns_list_len >= c_ns_list_size:
            # array is full => start with 20 entries, then double the size
            if c_ns_list is NULL:
                c_ns_list_size = 20
            else:
                c_ns_list_size *= 2
            c_nsref_ptr = <_ns_node_ref*> python.lxml_realloc(
                c_ns_list, c_ns_list_size, sizeof(_ns_node_ref))
            if c_nsref_ptr is NULL:
                if c_ns_list is not NULL:
                    python.lxml_free(c_ns_list)
                    _c_ns_list[0] = NULL
                raise MemoryError()
            c_ns_list = c_nsref_ptr

        c_ns_list[c_ns_list_len] = _ns_node_ref(c_nsdef, c_element)
        c_ns_list_len += 1
        c_nsdef = c_nsdef.next

    _c_ns_list_size[0] = c_ns_list_size
    _c_ns_list_len[0] = c_ns_list_len
    _c_ns_list[0] = c_ns_list
    return 0


cdef int _removeUnusedNamespaceDeclarations(xmlNode* c_element, set prefixes_to_keep) except -1:
    u"""Remove any namespace declarations from a subtree that are not used by
    any of its elements (or attributes).

    If a 'prefixes_to_keep' is provided, it must be a set of prefixes.
    Any corresponding namespace mappings will not be removed as part of the cleanup.
    """
    cdef xmlNode* c_node
    cdef _ns_node_ref* c_ns_list = NULL
    cdef size_t c_ns_list_size = 0
    cdef size_t c_ns_list_len = 0
    cdef size_t i

    if c_element.parent and c_element.parent.type == tree.XML_DOCUMENT_NODE:
        # include declarations on the document node
        _collectNsDefs(c_element.parent, &c_ns_list, &c_ns_list_len, &c_ns_list_size)

    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_element, c_element, 1)
    # collect all new namespace declarations into the ns list
    if c_element.nsDef:
        _collectNsDefs(c_element, &c_ns_list, &c_ns_list_len, &c_ns_list_size)

    # remove all namespace declarations from the list that are referenced
    if c_ns_list_len and c_element.type == tree.XML_ELEMENT_NODE:
        c_node = c_element
        while c_node and c_ns_list_len:
            if c_node.ns:
                for i in range(c_ns_list_len):
                    if c_node.ns is c_ns_list[i].ns:
                        # drop entry i by overwriting it with the last entry
                        c_ns_list_len -= 1
                        c_ns_list[i] = c_ns_list[c_ns_list_len]
                        #c_ns_list[c_ns_list_len] = _ns_node_ref(NULL, NULL)
                        break
            if c_node is c_element:
                # continue with attributes
                c_node = <xmlNode*>c_element.properties
            else:
                c_node = c_node.next
    tree.END_FOR_EACH_ELEMENT_FROM(c_element)

    if c_ns_list is NULL:
        return 0

    # free all namespace declarations that remained in the list,
    # except for those we should keep explicitly
    cdef xmlNs* c_nsdef
    for i in range(c_ns_list_len):
        if prefixes_to_keep is not None:
            if c_ns_list[i].ns.prefix and c_ns_list[i].ns.prefix in prefixes_to_keep:
                continue
        c_node = c_ns_list[i].node
        c_nsdef = c_node.nsDef
        if c_nsdef is c_ns_list[i].ns:
            # declaration is the head of the node's nsDef list
            c_node.nsDef = c_node.nsDef.next
        else:
            # find the predecessor in the nsDef list and unlink the entry
            while c_nsdef.next is not c_ns_list[i].ns:
                c_nsdef = c_nsdef.next
            c_nsdef.next = c_nsdef.next.next
        tree.xmlFreeNs(c_ns_list[i].ns)

    if c_ns_list is not NULL:
        python.lxml_free(c_ns_list)
    return 0

cdef xmlNs* _searchNsByHref(xmlNode* c_node, const_xmlChar* c_href, bint is_attribute):
    u"""Search a namespace declaration that covers a node (element or
    attribute).

    For attributes, try to find a prefixed namespace declaration
    instead of the default namespaces.  This helps in supporting
    round-trips for attributes on elements with a different namespace.

    Returns NULL if no matching declaration is in scope.
    """
    cdef xmlNs* c_ns
    cdef xmlNs* c_default_ns = NULL
    cdef xmlNode* c_element
    if c_href is NULL or c_node is NULL or c_node.type == tree.XML_ENTITY_REF_NODE:
        return NULL
    if tree.xmlStrcmp(c_href, tree.XML_XML_NAMESPACE) == 0:
        # no special cases here, let libxml2 handle this
        return tree.xmlSearchNsByHref(c_node.doc, c_node, c_href)
    if c_node.type == tree.XML_ATTRIBUTE_NODE:
        is_attribute = 1
    while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
        c_node = c_node.parent
    c_element = c_node
    while c_node is not NULL:
        if c_node.type == tree.XML_ELEMENT_NODE:
            c_ns = c_node.nsDef
            while c_ns is not NULL:
                if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0:
                    if c_ns.prefix is NULL and is_attribute:
                        # for attributes, continue searching a named
                        # prefix, but keep the first default namespace
                        # declaration that we found
                        if c_default_ns is NULL:
                            c_default_ns = c_ns
                    elif tree.xmlSearchNs(
                            c_element.doc, c_element, c_ns.prefix) is c_ns:
                        # start node is in namespace scope => found!
                        return c_ns
                c_ns = c_ns.next
            if c_node is not c_element and c_node.ns is not NULL:
                # optimise: the node may have the namespace itself
                c_ns = c_node.ns
                if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0:
                    if c_ns.prefix is NULL and is_attribute:
                        # for attributes, continue searching a named
                        # prefix, but keep the first default namespace
                        # declaration that we found
                        if c_default_ns is NULL:
                            c_default_ns = c_ns
                    elif tree.xmlSearchNs(
                            c_element.doc, c_element, c_ns.prefix) is c_ns:
                        # start node is in namespace scope => found!
                        return c_ns
        c_node = c_node.parent
    # nothing found => use a matching default namespace or fail
    if c_default_ns is not NULL:
        if tree.xmlSearchNs(c_element.doc, c_element, NULL) is c_default_ns:
            return c_default_ns
    return NULL

cdef int _replaceNodeByChildren(_Document doc, xmlNode* c_node) except -1:
    # Splice the children of c_node into its parent at the position of
    # c_node, then detach c_node.
    # NOTE: this does not deallocate the node, just unlink it!
    # NOTE(review): c_parent is dereferenced without a NULL check when
    # c_node has children - presumably callers guarantee a parent; confirm.
    cdef xmlNode* c_parent
    cdef xmlNode* c_child
    if c_node.children is NULL:
        tree.xmlUnlinkNode(c_node)
        return 0

    c_parent = c_node.parent
    # fix parent links of children
    c_child = c_node.children
    while c_child is not NULL:
        c_child.parent = c_parent
        c_child = c_child.next

    # fix namespace references of children if their parent's namespace
    # declarations get lost
    if c_node.nsDef is not NULL:
        c_child = c_node.children
        while c_child is not NULL:
            moveNodeToDocument(doc, doc._c_doc, c_child)
            c_child = c_child.next

    # fix sibling links to/from child slice
    if c_node.prev is NULL:
        c_parent.children = c_node.children
    else:
        c_node.prev.next = c_node.children
        c_node.children.prev = c_node.prev
    if c_node.next is NULL:
        c_parent.last = c_node.last
    else:
        c_node.next.prev = c_node.last
        c_node.last.next = c_node.next

    # unlink c_node
    c_node.children = c_node.last = NULL
    c_node.parent = c_node.next = c_node.prev = NULL
    return 0

cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node):
    # Return the value of an existing attribute node as a Python string.
    # The C string returned by libxml2 is freed after decoding.
    c_href = _getNs(<xmlNode*>c_attrib_node)
    value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href)
    try:
        result = funicode(value)
    finally:
        tree.xmlFree(value)
    return result

cdef object _attributeValueFromNsName(xmlNode* c_element,
                                      const_xmlChar* c_href, const_xmlChar* c_name):
    # Look up an attribute by namespace URI and name; returns None if the
    # lookup yields NULL.
    c_result = tree.xmlGetNsProp(c_element, c_name, c_href)
    if c_result is NULL:
        return None
    try:
        result = funicode(c_result)
    finally:
        tree.xmlFree(c_result)
    return result

cdef object _getNodeAttributeValue(xmlNode* c_node, key, default):
    # Look up the attribute named by 'key' (split by _getNsTag()) and
    # return 'default' if the lookup yields NULL.
    ns, tag = _getNsTag(key)
    c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
    c_result = tree.xmlGetNsProp(c_node, _xcstr(tag), c_href)
    if c_result is NULL:
        # XXX free namespace that is not in use..?
        return default
    try:
        result = funicode(c_result)
    finally:
        tree.xmlFree(c_result)
    return result

cdef inline object _getAttributeValue(_Element element, key, default):
    return _getNodeAttributeValue(element._c_node, key, default)

cdef int _setAttributeValue(_Element element, key, value) except -1:
    # Set (or replace) an attribute on the element.  QName values are
    # resolved to prefixed text first.  For HTML documents, a None value
    # is passed on to libxml2 as a NULL value and names are not validated.
    cdef const_xmlChar* c_value
    cdef xmlNs* c_ns
    ns, tag = _getNsTag(key)
    is_html = element._doc._parser._for_html
    if not is_html:
        _attributeValidOrRaise(tag)
    c_tag = _xcstr(tag)
    if value is None and is_html:
        c_value = NULL
    else:
        if isinstance(value, QName):
            value = _resolveQNameText(element, value)
        else:
            value = _utf8(value)
        c_value = _xcstr(value)
    if ns is None:
        c_ns = NULL
    else:
        c_ns = element._doc._findOrBuildNodeNs(element._c_node, _xcstr(ns), NULL, 1)
    tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value)
    return 0

cdef int _delAttribute(_Element element, key) except -1:
    # Remove the attribute named by 'key'; raises KeyError if it is missing.
    ns, tag = _getNsTag(key)
    c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
    if _delAttributeFromNsName(element._c_node, c_href, _xcstr(tag)):
        raise KeyError, key
    return 0

cdef int _delAttributeFromNsName(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name):
    # Returns 0 if the attribute was found and removed, -1 if it is missing.
    c_attr = tree.xmlHasNsProp(c_node, c_name, c_href)
    if c_attr is NULL:
        # XXX free namespace that is not in use..?
        return -1
    tree.xmlRemoveProp(c_attr)
    return 0

cdef list _collectAttributes(xmlNode* c_node, int collecttype):
    u"""Collect all attributes of a node in a list.  Depending on collecttype,
    it collects either the name (1), the value (2) or the name-value tuples.
    """
    cdef Py_ssize_t count
    # first pass: count the attributes to allocate the result list in one go
    c_attr = c_node.properties
    count = 0
    while c_attr is not NULL:
        if c_attr.type == tree.XML_ATTRIBUTE_NODE:
            count += 1
        c_attr = c_attr.next

    if not count:
        return []

    # second pass: fill in the items
    attributes = [None] * count
    c_attr = c_node.properties
    count = 0
    while c_attr is not NULL:
        if c_attr.type == tree.XML_ATTRIBUTE_NODE:
            if collecttype == 1:
                item = _namespacedName(<xmlNode*>c_attr)
            elif collecttype == 2:
                item = _attributeValue(c_node, c_attr)
            else:
                item = (_namespacedName(<xmlNode*>c_attr),
                        _attributeValue(c_node, c_attr))
            attributes[count] = item
            count += 1
        c_attr = c_attr.next
    return attributes

# matches an XML declaration at the start of a string that carries an
# 'encoding' pseudo attribute; group 1 is the declaration up to the
# encoding, group 2 the declaration end (if it follows directly)
cdef object __RE_XML_ENCODING = re.compile(
    ur'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)

cdef object __REPLACE_XML_ENCODING = __RE_XML_ENCODING.sub
cdef object __HAS_XML_ENCODING = __RE_XML_ENCODING.match

cdef object _stripEncodingDeclaration(object xml_string):
    # this is a hack to remove the XML encoding declaration from unicode
    return __REPLACE_XML_ENCODING(ur'\g<1>\g<2>', xml_string)

cdef bint _hasEncodingDeclaration(object xml_string) except -1:
    # check if a (unicode) string has an XML encoding declaration
    return __HAS_XML_ENCODING(xml_string) is not None

cdef inline bint _hasText(xmlNode* c_node):
    return c_node is not NULL and _textNodeOrSkip(c_node.children) is not NULL

cdef inline bint _hasTail(xmlNode* c_node):
    return c_node is not NULL and _textNodeOrSkip(c_node.next) is not NULL

cdef inline bint _hasNonWhitespaceTail(xmlNode* c_node):
    return _hasNonWhitespaceText(c_node, tail=True)

cdef bint _hasNonWhitespaceText(xmlNode* c_node, bint tail=False):
    # Check the leading text of c_node (or its tail text if 'tail' is true)
    # for any non-whitespace content.
    c_text_node = c_node and _textNodeOrSkip(c_node.next if tail else c_node.children)
    if c_text_node is NULL:
        return False
    while c_text_node is not NULL:
        if c_text_node.content[0] != c'\0' and not _collectText(c_text_node).isspace():
            return True
        c_text_node = _textNodeOrSkip(c_text_node.next)
    return False

cdef _collectText(xmlNode* c_node):
    u"""Collect all text nodes and return them as a unicode string.

    Start collecting at c_node.

    If there was no text to collect, return None
    """
    cdef Py_ssize_t scount
    cdef xmlChar* c_text
    cdef xmlNode* c_node_cur
    # check for multiple text nodes
    scount = 0
    c_text = NULL
    c_node_cur = c_node = _textNodeOrSkip(c_node)
    while c_node_cur is not NULL:
        if c_node_cur.content[0] != c'\0':
            c_text = c_node_cur.content
        scount += 1
        c_node_cur = _textNodeOrSkip(c_node_cur.next)

    # handle two most common cases first
    if c_text is NULL:
        # only empty text nodes (=> '') or no text nodes at all (=> None)
        return '' if scount > 0 else None
    if scount == 1:
        return funicode(c_text)

    # the rest is not performance critical anymore
    result = b''
    while c_node is not NULL:
        result += <unsigned char*>c_node.content
        c_node = _textNodeOrSkip(c_node.next)
    return funicode(<const_xmlChar*><unsigned char*>result)

cdef void _removeText(xmlNode* c_node):
    u"""Remove all text nodes.

    Start removing at c_node.
    """
    cdef xmlNode* c_next
    c_node = _textNodeOrSkip(c_node)
    while c_node is not NULL:
        # look up the successor before the current node gets freed
        c_next = _textNodeOrSkip(c_node.next)
        tree.xmlUnlinkNode(c_node)
        tree.xmlFreeNode(c_node)
        c_node = c_next

cdef xmlNode* _createTextNode(xmlDoc* doc, value) except NULL:
    # Create a CDATA block for CDATA values and a plain text node for
    # everything else.  Raises MemoryError if the node cannot be created.
    cdef xmlNode* c_text_node
    if isinstance(value, CDATA):
        c_text_node = tree.xmlNewCDataBlock(
            doc, _xcstr((<CDATA>value)._utf8_data),
            python.PyBytes_GET_SIZE((<CDATA>value)._utf8_data))
    else:
        text = _utf8(value)
        c_text_node = tree.xmlNewDocText(doc, _xcstr(text))
    if not c_text_node:
        raise MemoryError()
    return c_text_node

cdef int _setNodeText(xmlNode* c_node, value) except -1:
    # remove all text nodes at the start first
    _removeText(c_node.children)
    if value is None:
        return 0
    # now add new text node with value at start
    c_text_node = _createTextNode(c_node.doc, value)
    if c_node.children is NULL:
        tree.xmlAddChild(c_node, c_text_node)
    else:
        tree.xmlAddPrevSibling(c_node.children, c_text_node)
    return 0

cdef int _setTailText(xmlNode* c_node, value) except -1:
    # remove all tail text nodes that follow the node first
    _removeText(c_node.next)
    if value is None:
        return 0
    # now append new text node with value
    c_text_node = _createTextNode(c_node.doc, value)
    tree.xmlAddNextSibling(c_node, c_text_node)
    return 0

cdef bytes _resolveQNameText(_Element element, value):
    # Turn a qualified name into 'prefix:name' bytes, using the prefix of
    # the namespace declaration that is found or built for the element.
    # Names without a namespace are returned unchanged.
    cdef xmlNs* c_ns
    ns, tag = _getNsTag(value)
    if ns is None:
        return tag
    else:
        c_ns = element._doc._findOrBuildNodeNs(
            element._c_node, _xcstr(ns), NULL, 0)
        return python.PyBytes_FromFormat('%s:%s', c_ns.prefix, _cstr(tag))

cdef inline bint _hasChild(xmlNode* c_node):
    return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL

cdef inline Py_ssize_t _countElements(xmlNode* c_node):
    u"Counts the elements within the following siblings and the node itself."
    cdef Py_ssize_t count
    count = 0
    while c_node is not NULL:
        if _isElement(c_node):
            count += 1
        c_node = c_node.next
    return count

cdef int _findChildSlice(
        slice sliceobject, xmlNode* c_parent,
        xmlNode** c_start_node, Py_ssize_t* c_step, Py_ssize_t* c_length) except -1:
    u"""Resolve a children slice.

    Returns the start node, step size and the slice length in the
    pointer arguments.
    """
    cdef Py_ssize_t start = 0, stop = 0, childcount
    childcount = _countElements(c_parent.children)
    if childcount == 0:
        c_start_node[0] = NULL
        c_length[0] = 0
        if sliceobject.step is None:
            c_step[0] = 1
        else:
            python._PyEval_SliceIndex(sliceobject.step, c_step)
        return 0
    python.PySlice_GetIndicesEx(
        sliceobject, childcount, &start, &stop, c_step, c_length)
    # search from whichever end of the child list is closer to 'start'
    if start > childcount // 2:
        c_start_node[0] = _findChildBackwards(c_parent, childcount - start - 1)
    else:
        c_start_node[0] = _findChild(c_parent, start)
    return 0

cdef bint _isFullSlice(slice sliceobject) except -1:
    u"""Conservative guess if this slice is a full slice as in ``s[:]``.

    Only slices without start and stop and with a step of None or 1 are
    considered full slices.
    """
    cdef Py_ssize_t step = 0
    if sliceobject is None:
        return 0
    if sliceobject.start is None and \
            sliceobject.stop is None:
        if sliceobject.step is None:
            return 1
        python._PyEval_SliceIndex(sliceobject.step, &step)
        if step == 1:
            return 1
        return 0
    return 0

cdef _collectChildren(_Element element):
    # Return a list of element proxies for all child elements, in
    # document order.
    cdef xmlNode* c_node
    cdef list result = []
    c_node = element._c_node.children
    if c_node is not NULL:
        if not _isElement(c_node):
            c_node = _nextElement(c_node)
        while c_node is not NULL:
            result.append(_elementFactory(element._doc, c_node))
            c_node = _nextElement(c_node)
    return result

cdef inline xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index):
    # negative indices count from the end, as in Python sequences
    if index < 0:
        return _findChildBackwards(c_node, -index - 1)
    else:
        return _findChildForwards(c_node, index)

cdef inline xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index):
    u"""Return child element of c_node with index, or return NULL if not found.
    """
    cdef xmlNode* c_child
    cdef Py_ssize_t c
    c_child = c_node.children
    c = 0
    while c_child is not NULL:
        if _isElement(c_child):
            if c == index:
                return c_child
            c += 1
        c_child = c_child.next
    return NULL

cdef inline xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index):
    u"""Return child element of c_node with index, or return NULL if not found.
    Search from the end.
    """
    cdef xmlNode* c_child
    cdef Py_ssize_t c
    c_child = c_node.last
    c = 0
    while c_child is not NULL:
        if _isElement(c_child):
            if c == index:
                return c_child
            c += 1
        c_child = c_child.prev
    return NULL

cdef inline xmlNode* _textNodeOrSkip(xmlNode* c_node) nogil:
    u"""Return the node if it's a text node.  Skip over ignorable nodes in a
    series of text nodes.  Return NULL if a non-ignorable node is found.

    This is used to skip over XInclude nodes when collecting adjacent text
    nodes.
    """
    while c_node is not NULL:
        if c_node.type == tree.XML_TEXT_NODE or \
                c_node.type == tree.XML_CDATA_SECTION_NODE:
            return c_node
        elif c_node.type == tree.XML_XINCLUDE_START or \
                c_node.type == tree.XML_XINCLUDE_END:
            c_node = c_node.next
        else:
            return NULL
    return NULL

cdef inline xmlNode* _nextElement(xmlNode* c_node):
    u"""Given a node, find the next sibling that is an element.
    """
    if c_node is NULL:
        return NULL
    c_node = c_node.next
    while c_node is not NULL:
        if _isElement(c_node):
            return c_node
        c_node = c_node.next
    return NULL

cdef inline xmlNode* _previousElement(xmlNode* c_node):
    u"""Given a node, find the previous sibling that is an element.
    """
    if c_node is NULL:
        return NULL
    c_node = c_node.prev
    while c_node is not NULL:
        if _isElement(c_node):
            return c_node
        c_node = c_node.prev
    return NULL

cdef inline xmlNode* _parentElement(xmlNode* c_node):
    u"Given a node, find the parent element."
    if c_node is NULL or not _isElement(c_node):
        return NULL
    c_node = c_node.parent
    if c_node is NULL or not _isElement(c_node):
        return NULL
    return c_node

cdef inline bint _tagMatches(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name):
    u"""Tests if the node matches namespace URI and tag name.

    A node matches if it matches both c_href and c_name.

    A node matches c_href if any of the following is true:
    * c_href is NULL
    * its namespace is NULL and c_href is the empty string
    * its namespace string equals the c_href string

    A node matches c_name if any of the following is true:
    * c_name is NULL
    * its name string equals the c_name string

    Note that when a name is given and c_href is NULL, only nodes without
    a namespace match.
    """
    if c_node is NULL:
        return 0
    if c_node.type != tree.XML_ELEMENT_NODE:
        # not an element, only succeed if we match everything
        return c_name is NULL and c_href is NULL
    if c_name is NULL:
        if c_href is NULL:
            # always match
            return 1
        else:
            c_node_href = _getNs(c_node)
            if c_node_href is NULL:
                return c_href[0] == c'\0'
            else:
                return tree.xmlStrcmp(c_node_href, c_href) == 0
    elif c_href is NULL:
        if _getNs(c_node) is not NULL:
            return 0
        return c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0
    elif c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0:
        c_node_href = _getNs(c_node)
        if c_node_href is NULL:
            return c_href[0] == c'\0'
        else:
            return tree.xmlStrcmp(c_node_href, c_href) == 0
    else:
        return 0

cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname):
    u"""Tests if the node matches namespace URI and tag name.

    This differs from _tagMatches() in that it expects the c_name to be
    taken from the doc dict, i.e. it only compares the names by address.

    The comparison itself is done by _nsTagMatchesExactly() on the
    namespace URI and name of the node.
    """
    return _nsTagMatchesExactly(_getNs(c_node), c_node.name, c_qname)

cdef inline bint _nsTagMatchesExactly(const_xmlChar* c_node_href,
                                      const_xmlChar* c_node_name,
                                      qname* c_qname):
    u"""Tests if name and namespace URI match those of c_qname.

    This differs from _tagMatches() in that it expects the c_name to be
    taken from the doc dict, i.e. it only compares the names by address.

    A node matches if it matches both href and c_name of the qname.

    A node matches href if any of the following is true:
    * href is NULL
    * href is the empty string and the node namespace is NULL or empty
    * its namespace string equals the href string

    A node matches c_name if any of the following is true:
    * c_name is NULL
    * its name string points to the same address (!) as c_name
    """
    cdef char* c_href
    if c_qname.c_name is not NULL and c_qname.c_name is not c_node_name:
        return 0
    if c_qname.href is NULL:
        return 1
    c_href = python.__cstr(c_qname.href)
    if c_href[0] == '\0':
        return c_node_href is NULL or c_node_href[0] == '\0'
    elif c_node_href is NULL:
        return 0
    else:
        return tree.xmlStrcmp(<const_xmlChar*>c_href, c_node_href) == 0

cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags,
                                          qname* c_ns_tags, bint force_into_dict) except -1:
    u"""Map a sequence of (namespace, name) pairs to a qname array for
    efficient matching with _tagMatchesExactly() above.

    Returns the number of qname entries that were filled in.  Unless
    'force_into_dict' is set, names that are not in the document dict are
    skipped.

    Note that each qname struct in the array owns its href byte string object
    if it is not NULL.
    """
    cdef Py_ssize_t count = 0, i
    cdef bytes ns, tag
    for ns, tag in ns_tags:
        if tag is None:
            c_tag = <const_xmlChar*>NULL
        elif force_into_dict:
            c_tag = tree.xmlDictLookup(c_doc.dict, _xcstr(tag), len(tag))
            if c_tag is NULL:
                # clean up before raising the error
                for i in range(count):
                    cpython.ref.Py_XDECREF(c_ns_tags[i].href)
                raise MemoryError()
        else:
            c_tag = tree.xmlDictExists(c_doc.dict, _xcstr(tag), len(tag))
            if c_tag is NULL:
                # not in the dict => not in the document
                continue
        c_ns_tags[count].c_name = c_tag
        if ns is None:
            c_ns_tags[count].href = NULL
        else:
            cpython.ref.Py_INCREF(ns)  # keep an owned reference!
            c_ns_tags[count].href = <python.PyObject*>ns
        count += 1
    return count

cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
    u"""Unlink and free a node and subnodes if possible.  Otherwise, make sure
    it's self-contained.
    """
    cdef xmlNode* c_next
    c_next = c_node.next
    tree.xmlUnlinkNode(c_node)
    _moveTail(c_next, c_node)
    if not attemptDeallocation(c_node):
        # make namespaces absolute
        moveNodeToDocument(doc, c_node.doc, c_node)
    return 0

cdef int _removeSiblings(xmlNode* c_element, tree.xmlElementType node_type, bint with_tail) except -1:
    # Remove all siblings of the given node type that follow or precede
    # c_element, optionally together with their tail text.
    cdef xmlNode* c_node
    cdef xmlNode* c_next
    # following siblings
    c_node = c_element.next
    while c_node is not NULL:
        c_next = _nextElement(c_node)
        if c_node.type == node_type:
            if with_tail:
                _removeText(c_node.next)
            tree.xmlUnlinkNode(c_node)
            attemptDeallocation(c_node)
        c_node = c_next
    # preceding siblings
    c_node = c_element.prev
    while c_node is not NULL:
        c_next = _previousElement(c_node)
        if c_node.type == node_type:
            if with_tail:
                _removeText(c_node.next)
            tree.xmlUnlinkNode(c_node)
            attemptDeallocation(c_node)
        c_node = c_next
    return 0

cdef void
_moveTail(xmlNode* c_tail, xmlNode* c_target): 1096 cdef xmlNode* c_next 1097 # tail support: look for any text nodes trailing this node and 1098 # move them too 1099 c_tail = _textNodeOrSkip(c_tail) 1100 while c_tail is not NULL: 1101 c_next = _textNodeOrSkip(c_tail.next) 1102 c_target = tree.xmlAddNextSibling(c_target, c_tail) 1103 c_tail = c_next 1104 1105 cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1: 1106 cdef xmlNode* c_new_tail 1107 # tail copying support: look for any text nodes trailing this node and 1108 # copy it to the target node 1109 c_tail = _textNodeOrSkip(c_tail) 1110 while c_tail is not NULL: 1111 if c_target.doc is not c_tail.doc: 1112 c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0) 1113 else: 1114 c_new_tail = tree.xmlCopyNode(c_tail, 0) 1115 if c_new_tail is NULL: 1116 raise MemoryError() 1117 c_target = tree.xmlAddNextSibling(c_target, c_new_tail) 1118 c_tail = _textNodeOrSkip(c_tail.next) 1119 return 0 1120 1121 cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1: 1122 cdef xmlNode* c_copy 1123 cdef xmlNode* c_sibling = c_node 1124 while c_sibling.prev != NULL and \ 1125 (c_sibling.prev.type == tree.XML_PI_NODE or 1126 c_sibling.prev.type == tree.XML_COMMENT_NODE or 1127 c_sibling.prev.type == tree.XML_DTD_NODE): 1128 c_sibling = c_sibling.prev 1129 while c_sibling != c_node: 1130 if c_sibling.type == tree.XML_DTD_NODE: 1131 c_copy = <xmlNode*>_copyDtd(<tree.xmlDtd*>c_sibling) 1132 if c_sibling == <xmlNode*>c_node.doc.intSubset: 1133 c_target.doc.intSubset = <tree.xmlDtd*>c_copy 1134 else: # c_sibling == c_node.doc.extSubset 1135 c_target.doc.extSubset = <tree.xmlDtd*>c_copy 1136 else: 1137 c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) 1138 if c_copy is NULL: 1139 raise MemoryError() 1140 tree.xmlAddPrevSibling(c_target, c_copy) 1141 c_sibling = c_sibling.next 1142 while c_sibling.next != NULL and \ 1143 (c_sibling.next.type == tree.XML_PI_NODE or 1144 c_sibling.next.type == 
tree.XML_COMMENT_NODE): 1145 c_sibling = c_sibling.next 1146 c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) 1147 if c_copy is NULL: 1148 raise MemoryError() 1149 tree.xmlAddNextSibling(c_target, c_copy) 1150 1151 cdef int _deleteSlice(_Document doc, xmlNode* c_node, 1152 Py_ssize_t count, Py_ssize_t step) except -1: 1153 u"""Delete slice, ``count`` items starting with ``c_node`` with a step 1154 width of ``step``. 1155 """ 1156 cdef xmlNode* c_next 1157 cdef Py_ssize_t c, i 1158 cdef _node_to_node_function next_element 1159 if c_node is NULL: 1160 return 0 1161 if step > 0: 1162 next_element = _nextElement 1163 else: 1164 step = -step 1165 next_element = _previousElement 1166 # now start deleting nodes 1167 c = 0 1168 c_next = c_node 1169 while c_node is not NULL and c < count: 1170 for i in range(step): 1171 c_next = next_element(c_next) 1172 if c_next is NULL: 1173 break 1174 _removeNode(doc, c_node) 1175 c += 1 1176 c_node = c_next 1177 return 0 1178 1179 cdef int _replaceSlice(_Element parent, xmlNode* c_node, 1180 Py_ssize_t slicelength, Py_ssize_t step, 1181 bint left_to_right, elements) except -1: 1182 u"""Replace the slice of ``count`` elements starting at ``c_node`` with 1183 positive step width ``step`` by the Elements in ``elements``. The 1184 direction is given by the boolean argument ``left_to_right``. 1185 1186 ``c_node`` may be NULL to indicate the end of the children list. 1187 """ 1188 cdef xmlNode* c_orig_neighbour 1189 cdef xmlNode* c_next 1190 cdef xmlDoc* c_source_doc 1191 cdef _Element element 1192 cdef Py_ssize_t seqlength, i, c 1193 cdef _node_to_node_function next_element 1194 assert step > 0 1195 if left_to_right: 1196 next_element = _nextElement 1197 else: 1198 next_element = _previousElement 1199 1200 if not isinstance(elements, (list, tuple)): 1201 elements = list(elements) 1202 1203 if step != 1 or not left_to_right: 1204 # *replacing* children stepwise with list => check size! 
1205 seqlength = len(elements) 1206 if seqlength != slicelength: 1207 raise ValueError, f"attempt to assign sequence of size {seqlength} " \ 1208 f"to extended slice of size {slicelength}" 1209 1210 if c_node is NULL: 1211 # no children yet => add all elements straight away 1212 if left_to_right: 1213 for element in elements: 1214 assert element is not None, u"Node must not be None" 1215 _appendChild(parent, element) 1216 else: 1217 for element in elements: 1218 assert element is not None, u"Node must not be None" 1219 _prependChild(parent, element) 1220 return 0 1221 1222 # remove the elements first as some might be re-added 1223 if left_to_right: 1224 # L->R, remember left neighbour 1225 c_orig_neighbour = _previousElement(c_node) 1226 else: 1227 # R->L, remember right neighbour 1228 c_orig_neighbour = _nextElement(c_node) 1229 1230 # We remove the original slice elements one by one. Since we hold 1231 # a Python reference to all elements that we will insert, it is 1232 # safe to let _removeNode() try (and fail) to free them even if 1233 # the element itself or one of its descendents will be reinserted. 
1234 c = 0 1235 c_next = c_node 1236 while c_node is not NULL and c < slicelength: 1237 for i in range(step): 1238 c_next = next_element(c_next) 1239 if c_next is NULL: 1240 break 1241 _removeNode(parent._doc, c_node) 1242 c += 1 1243 c_node = c_next 1244 1245 # make sure each element is inserted only once 1246 elements = iter(elements) 1247 1248 # find the first node right of the new insertion point 1249 if left_to_right: 1250 if c_orig_neighbour is not NULL: 1251 c_node = next_element(c_orig_neighbour) 1252 else: 1253 # before the first element 1254 c_node = _findChildForwards(parent._c_node, 0) 1255 elif c_orig_neighbour is NULL: 1256 # at the end, but reversed stepping 1257 # append one element and go to the next insertion point 1258 for element in elements: 1259 assert element is not None, u"Node must not be None" 1260 _appendChild(parent, element) 1261 c_node = element._c_node 1262 if slicelength > 0: 1263 slicelength -= 1 1264 for i in range(1, step): 1265 c_node = next_element(c_node) 1266 if c_node is NULL: 1267 break 1268 break 1269 else: 1270 c_node = c_orig_neighbour 1271 1272 if left_to_right: 1273 # adjust step size after removing slice as we are not stepping 1274 # over the newly inserted elements 1275 step -= 1 1276 1277 # now insert elements where we removed them 1278 if c_node is not NULL: 1279 for element in elements: 1280 assert element is not None, u"Node must not be None" 1281 _assertValidNode(element) 1282 # move element and tail over 1283 c_source_doc = element._c_node.doc 1284 c_next = element._c_node.next 1285 tree.xmlAddPrevSibling(c_node, element._c_node) 1286 _moveTail(c_next, element._c_node) 1287 1288 # integrate element into new document 1289 moveNodeToDocument(parent._doc, c_source_doc, element._c_node) 1290 1291 # stop at the end of the slice 1292 if slicelength > 0: 1293 slicelength -= 1 1294 for i in range(step): 1295 c_node = next_element(c_node) 1296 if c_node is NULL: 1297 break 1298 if c_node is NULL: 1299 break 1300 else: 
1301 # everything inserted 1302 return 0 1303 1304 # append the remaining elements at the respective end 1305 if left_to_right: 1306 for element in elements: 1307 assert element is not None, u"Node must not be None" 1308 _assertValidNode(element) 1309 _appendChild(parent, element) 1310 else: 1311 for element in elements: 1312 assert element is not None, u"Node must not be None" 1313 _assertValidNode(element) 1314 _prependChild(parent, element) 1315 1316 return 0 1317 1318 1319 cdef int _linkChild(xmlNode* c_parent, xmlNode* c_node) except -1: 1320 """Adaptation of 'xmlAddChild()' that deep-fix the document links iteratively. 1321 """ 1322 assert _isElement(c_node) 1323 c_node.parent = c_parent 1324 if c_parent.children is NULL: 1325 c_parent.children = c_parent.last = c_node 1326 else: 1327 c_node.prev = c_parent.last 1328 c_parent.last.next = c_node 1329 c_parent.last = c_node 1330 1331 _setTreeDoc(c_node, c_parent.doc) 1332 return 0 1333 1334 1335 cdef int _appendChild(_Element parent, _Element child) except -1: 1336 u"""Append a new child to a parent element. 1337 """ 1338 c_node = child._c_node 1339 c_source_doc = c_node.doc 1340 # prevent cycles 1341 if _isAncestorOrSame(c_node, parent._c_node): 1342 raise ValueError("cannot append parent to itself") 1343 # store possible text node 1344 c_next = c_node.next 1345 # move node itself 1346 tree.xmlUnlinkNode(c_node) 1347 # do not call xmlAddChild() here since it would deep-traverse the tree 1348 _linkChild(parent._c_node, c_node) 1349 _moveTail(c_next, c_node) 1350 # uh oh, elements may be pointing to different doc when 1351 # parent element has moved; change them too.. 1352 moveNodeToDocument(parent._doc, c_source_doc, c_node) 1353 return 0 1354 1355 cdef int _prependChild(_Element parent, _Element child) except -1: 1356 u"""Prepend a new child to a parent element. 
1357 """ 1358 c_node = child._c_node 1359 c_source_doc = c_node.doc 1360 # prevent cycles 1361 if _isAncestorOrSame(c_node, parent._c_node): 1362 raise ValueError("cannot append parent to itself") 1363 # store possible text node 1364 c_next = c_node.next 1365 # move node itself 1366 c_child = _findChildForwards(parent._c_node, 0) 1367 if c_child is NULL: 1368 tree.xmlUnlinkNode(c_node) 1369 # do not call xmlAddChild() here since it would deep-traverse the tree 1370 _linkChild(parent._c_node, c_node) 1371 else: 1372 tree.xmlAddPrevSibling(c_child, c_node) 1373 _moveTail(c_next, c_node) 1374 # uh oh, elements may be pointing to different doc when 1375 # parent element has moved; change them too.. 1376 moveNodeToDocument(parent._doc, c_source_doc, c_node) 1377 return 0 1378 1379 cdef int _appendSibling(_Element element, _Element sibling) except -1: 1380 u"""Add a new sibling behind an element. 1381 """ 1382 return _addSibling(element, sibling, as_next=True) 1383 1384 cdef int _prependSibling(_Element element, _Element sibling) except -1: 1385 u"""Add a new sibling before an element. 1386 """ 1387 return _addSibling(element, sibling, as_next=False) 1388 1389 cdef int _addSibling(_Element element, _Element sibling, bint as_next) except -1: 1390 c_node = sibling._c_node 1391 c_source_doc = c_node.doc 1392 # prevent cycles 1393 if _isAncestorOrSame(c_node, element._c_node): 1394 if element._c_node is c_node: 1395 return 0 # nothing to do 1396 raise ValueError("cannot add ancestor as sibling, please break cycle first") 1397 # store possible text node 1398 c_next = c_node.next 1399 # move node itself 1400 if as_next: 1401 tree.xmlAddNextSibling(element._c_node, c_node) 1402 else: 1403 tree.xmlAddPrevSibling(element._c_node, c_node) 1404 _moveTail(c_next, c_node) 1405 # uh oh, elements may be pointing to different doc when 1406 # parent element has moved; change them too.. 
1407 moveNodeToDocument(element._doc, c_source_doc, c_node) 1408 return 0 1409 1410 cdef inline bint isutf8(const_xmlChar* s): 1411 cdef xmlChar c = s[0] 1412 while c != c'\0': 1413 if c & 0x80: 1414 return True 1415 s += 1 1416 c = s[0] 1417 return False 1418 1419 cdef bint isutf8l(const_xmlChar* s, size_t length): 1420 """ 1421 Search for non-ASCII characters in the string, knowing its length in advance. 1422 """ 1423 cdef unsigned int i 1424 cdef unsigned long non_ascii_mask 1425 cdef const unsigned long *lptr = <const unsigned long*> s 1426 1427 cdef const unsigned long *end = lptr + length // sizeof(unsigned long) 1428 if length >= sizeof(non_ascii_mask): 1429 # Build constant 0x80808080... mask (and let the C compiler fold it). 1430 non_ascii_mask = 0 1431 for i in range(sizeof(non_ascii_mask) // 2): 1432 non_ascii_mask = (non_ascii_mask << 16) | 0x8080 1433 1434 # Advance to long-aligned character before we start reading longs. 1435 while (<size_t>s) % sizeof(unsigned long) and s < <const_xmlChar *>end: 1436 if s[0] & 0x80: 1437 return True 1438 s += 1 1439 1440 # Read one long at a time 1441 lptr = <const unsigned long*> s 1442 while lptr < end: 1443 if lptr[0] & non_ascii_mask: 1444 return True 1445 lptr += 1 1446 s = <const_xmlChar *>lptr 1447 1448 while s < (<const_xmlChar *>end + length % sizeof(unsigned long)): 1449 if s[0] & 0x80: 1450 return True 1451 s += 1 1452 1453 return False 1454 1455 cdef int _is_valid_xml_ascii(bytes pystring): 1456 """Check if a string is XML ascii content.""" 1457 cdef signed char ch 1458 # When ch is a *signed* char, non-ascii characters are negative integers 1459 # and xmlIsChar_ch does not accept them. 
1460 for ch in pystring: 1461 if not tree.xmlIsChar_ch(ch): 1462 return 0 1463 return 1 1464 1465 cdef bint _is_valid_xml_utf8(bytes pystring): 1466 u"""Check if a string is like valid UTF-8 XML content.""" 1467 cdef const_xmlChar* s = _xcstr(pystring) 1468 cdef const_xmlChar* c_end = s + len(pystring) 1469 cdef unsigned long next3 = 0 1470 if s < c_end - 2: 1471 next3 = (s[0] << 8) | (s[1]) 1472 1473 while s < c_end - 2: 1474 next3 = 0x00ffffff & ((next3 << 8) | s[2]) 1475 if s[0] & 0x80: 1476 # 0xefbfbe and 0xefbfbf are utf-8 encodings of 1477 # forbidden characters \ufffe and \uffff 1478 if next3 == 0x00efbfbe or next3 == 0x00efbfbf: 1479 return 0 1480 # 0xeda080 and 0xedbfbf are utf-8 encodings of 1481 # \ud800 and \udfff. Anything between them (inclusive) 1482 # is forbidden, because they are surrogate blocks in utf-16. 1483 if 0x00eda080 <= next3 <= 0x00edbfbf: 1484 return 0 1485 elif not tree.xmlIsChar_ch(s[0]): 1486 return 0 # invalid ascii char 1487 s += 1 1488 1489 while s < c_end: 1490 if not s[0] & 0x80 and not tree.xmlIsChar_ch(s[0]): 1491 return 0 # invalid ascii char 1492 s += 1 1493 1494 return 1 1495 1496 cdef inline object funicodeOrNone(const_xmlChar* s): 1497 return funicode(s) if s is not NULL else None 1498 1499 cdef inline object funicodeOrEmpty(const_xmlChar* s): 1500 return funicode(s) if s is not NULL else '' 1501 1502 cdef object funicode(const_xmlChar* s): 1503 cdef Py_ssize_t slen 1504 cdef const_xmlChar* spos 1505 cdef bint is_non_ascii 1506 if python.LXML_UNICODE_STRINGS: 1507 return s.decode('UTF-8') 1508 spos = s 1509 is_non_ascii = 0 1510 while spos[0] != c'\0': 1511 if spos[0] & 0x80: 1512 is_non_ascii = 1 1513 break 1514 spos += 1 1515 slen = spos - s 1516 if spos[0] != c'\0': 1517 slen += cstring_h.strlen(<const char*> spos) 1518 if is_non_ascii: 1519 return s[:slen].decode('UTF-8') 1520 return <bytes>s[:slen] 1521 1522 cdef bytes _utf8(object s): 1523 """Test if a string is valid user input and encode it to UTF-8. 
1524 Reject all bytes/unicode input that contains non-XML characters. 1525 Reject all bytes input that contains non-ASCII characters. 1526 """ 1527 cdef int valid 1528 cdef bytes utf8_string 1529 if python.IS_PYTHON2 and type(s) is bytes: 1530 utf8_string = <bytes>s 1531 valid = _is_valid_xml_ascii(utf8_string) 1532 elif isinstance(s, unicode): 1533 utf8_string = (<unicode>s).encode('utf8') 1534 valid = _is_valid_xml_utf8(utf8_string) 1535 elif isinstance(s, (bytes, bytearray)): 1536 utf8_string = bytes(s) 1537 valid = _is_valid_xml_ascii(utf8_string) 1538 else: 1539 raise TypeError("Argument must be bytes or unicode, got '%.200s'" % type(s).__name__) 1540 if not valid: 1541 raise ValueError( 1542 "All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters") 1543 return utf8_string 1544 1545 1546 cdef bytes _utf8orNone(object s): 1547 return _utf8(s) if s is not None else None 1548 1549 1550 cdef strrepr(s): 1551 """Build a representation of strings which we can use in __repr__ 1552 methods, e.g. _Element.__repr__(). 1553 """ 1554 return s.encode('unicode-escape') if python.IS_PYTHON2 else s 1555 1556 1557 cdef enum: 1558 NO_FILE_PATH = 0 1559 ABS_UNIX_FILE_PATH = 1 1560 ABS_WIN_FILE_PATH = 2 1561 REL_FILE_PATH = 3 1562 1563 1564 cdef bint _isFilePath(const_xmlChar* c_path): 1565 u"simple heuristic to see if a path is a filename" 1566 cdef xmlChar c 1567 # test if it looks like an absolute Unix path or a Windows network path 1568 if c_path[0] == c'/': 1569 return ABS_UNIX_FILE_PATH 1570 1571 # test if it looks like an absolute Windows path or URL 1572 if c'a' <= c_path[0] <= c'z' or c'A' <= c_path[0] <= c'Z': 1573 c_path += 1 1574 if c_path[0] == c':' and c_path[1] in b'\0\\': 1575 return ABS_WIN_FILE_PATH # C: or C:\... 
1576 1577 # test if it looks like a URL with scheme:// 1578 while c'a' <= c_path[0] <= c'z' or c'A' <= c_path[0] <= c'Z': 1579 c_path += 1 1580 if c_path[0] == c':' and c_path[1] == c'/' and c_path[2] == c'/': 1581 return NO_FILE_PATH 1582 1583 # assume it's a relative path 1584 return REL_FILE_PATH 1585 1586 cdef object _NO_FSPATH = object() 1587 1588 cdef object _getFSPathOrObject(object obj): 1589 """ 1590 Get the __fspath__ attribute of an object if it exists. 1591 Otherwise, the original object is returned. 1592 """ 1593 if _isString(obj): 1594 return obj 1595 if python.PY_VERSION_HEX >= 0x03060000: 1596 try: 1597 return python.PY_FSPath(obj) 1598 except TypeError: 1599 return obj 1600 fspath = getattr(obj, '__fspath__', _NO_FSPATH) 1601 if fspath is not _NO_FSPATH and callable(fspath): 1602 return fspath() 1603 return obj 1604 1605 cdef object _encodeFilename(object filename): 1606 u"""Make sure a filename is 8-bit encoded (or None). 1607 """ 1608 if filename is None: 1609 return None 1610 elif isinstance(filename, bytes): 1611 return filename 1612 elif isinstance(filename, unicode): 1613 filename8 = (<unicode>filename).encode('utf8') 1614 if _isFilePath(<unsigned char*>filename8): 1615 try: 1616 return python.PyUnicode_AsEncodedString( 1617 filename, _C_FILENAME_ENCODING, NULL) 1618 except UnicodeEncodeError: 1619 pass 1620 return filename8 1621 else: 1622 raise TypeError("Argument must be string or unicode.") 1623 1624 cdef object _decodeFilename(const_xmlChar* c_path): 1625 u"""Make the filename a unicode string if we are in Py3. 1626 """ 1627 return _decodeFilenameWithLength(c_path, tree.xmlStrlen(c_path)) 1628 1629 cdef object _decodeFilenameWithLength(const_xmlChar* c_path, size_t c_len): 1630 u"""Make the filename a unicode string if we are in Py3. 
1631 """ 1632 if _isFilePath(c_path): 1633 try: 1634 return python.PyUnicode_Decode( 1635 <const_char*>c_path, c_len, _C_FILENAME_ENCODING, NULL) 1636 except UnicodeDecodeError: 1637 pass 1638 try: 1639 return (<unsigned char*>c_path)[:c_len].decode('UTF-8') 1640 except UnicodeDecodeError: 1641 # this is a stupid fallback, but it might still work... 1642 return (<unsigned char*>c_path)[:c_len].decode('latin-1', 'replace') 1643 1644 cdef object _encodeFilenameUTF8(object filename): 1645 u"""Recode filename as UTF-8. Tries ASCII, local filesystem encoding and 1646 UTF-8 as source encoding. 1647 """ 1648 cdef char* c_filename 1649 if filename is None: 1650 return None 1651 elif isinstance(filename, bytes): 1652 if not isutf8l(<bytes>filename, len(<bytes>filename)): 1653 # plain ASCII! 1654 return filename 1655 c_filename = _cstr(<bytes>filename) 1656 try: 1657 # try to decode with default encoding 1658 filename = python.PyUnicode_Decode( 1659 c_filename, len(<bytes>filename), 1660 _C_FILENAME_ENCODING, NULL) 1661 except UnicodeDecodeError as decode_exc: 1662 try: 1663 # try if it's proper UTF-8 1664 (<bytes>filename).decode('utf8') 1665 return filename 1666 except UnicodeDecodeError: 1667 raise decode_exc # otherwise re-raise original exception 1668 if isinstance(filename, unicode): 1669 return (<unicode>filename).encode('utf8') 1670 else: 1671 raise TypeError("Argument must be string or unicode.") 1672 1673 cdef tuple _getNsTag(tag): 1674 u"""Given a tag, find namespace URI and tag name. 1675 Return None for NS uri if no namespace URI provided. 1676 """ 1677 return __getNsTag(tag, 0) 1678 1679 cdef tuple _getNsTagWithEmptyNs(tag): 1680 u"""Given a tag, find namespace URI and tag name. Return None for NS uri 1681 if no namespace URI provided, or the empty string if namespace 1682 part is '{}'. 
1683 """ 1684 return __getNsTag(tag, 1) 1685 1686 cdef tuple __getNsTag(tag, bint empty_ns): 1687 cdef char* c_tag 1688 cdef char* c_ns_end 1689 cdef Py_ssize_t taglen 1690 cdef Py_ssize_t nslen 1691 cdef bytes ns = None 1692 # _isString() is much faster than isinstance() 1693 if not _isString(tag) and isinstance(tag, QName): 1694 tag = (<QName>tag).text 1695 tag = _utf8(tag) 1696 c_tag = _cstr(tag) 1697 if c_tag[0] == c'{': 1698 c_tag += 1 1699 c_ns_end = cstring_h.strchr(c_tag, c'}') 1700 if c_ns_end is NULL: 1701 raise ValueError, u"Invalid tag name" 1702 nslen = c_ns_end - c_tag 1703 taglen = python.PyBytes_GET_SIZE(tag) - nslen - 2 1704 if taglen == 0: 1705 raise ValueError, u"Empty tag name" 1706 if nslen > 0: 1707 ns = <bytes>c_tag[:nslen] 1708 elif empty_ns: 1709 ns = b'' 1710 tag = <bytes>c_ns_end[1:taglen+1] 1711 elif python.PyBytes_GET_SIZE(tag) == 0: 1712 raise ValueError, u"Empty tag name" 1713 return ns, tag 1714 1715 cdef inline int _pyXmlNameIsValid(name_utf8): 1716 return _xmlNameIsValid(_xcstr(name_utf8)) and b':' not in name_utf8 1717 1718 cdef inline int _pyHtmlNameIsValid(name_utf8): 1719 return _htmlNameIsValid(_xcstr(name_utf8)) 1720 1721 cdef inline int _xmlNameIsValid(const_xmlChar* c_name): 1722 return tree.xmlValidateNameValue(c_name) 1723 1724 cdef int _htmlNameIsValid(const_xmlChar* c_name): 1725 if c_name is NULL or c_name[0] == c'\0': 1726 return 0 1727 while c_name[0] != c'\0': 1728 if c_name[0] in b'&<>/"\'\t\n\x0B\x0C\r ': 1729 return 0 1730 c_name += 1 1731 return 1 1732 1733 cdef bint _characterReferenceIsValid(const_xmlChar* c_name): 1734 cdef bint is_hex 1735 if c_name[0] == c'x': 1736 c_name += 1 1737 is_hex = 1 1738 else: 1739 is_hex = 0 1740 if c_name[0] == c'\0': 1741 return 0 1742 while c_name[0] != c'\0': 1743 if c_name[0] < c'0' or c_name[0] > c'9': 1744 if not is_hex: 1745 return 0 1746 if not (c'a' <= c_name[0] <= c'f'): 1747 if not (c'A' <= c_name[0] <= c'F'): 1748 return 0 1749 c_name += 1 1750 return 1 1751 1752 
cdef int _tagValidOrRaise(tag_utf) except -1:
    """Raise ValueError if 'tag_utf' (UTF-8 encoded bytes) does not pass
    _pyXmlNameIsValid().
    """
    if not _pyXmlNameIsValid(tag_utf):
        raise ValueError(f"Invalid tag name {(<bytes>tag_utf).decode('utf8')!r}")
    return 0

cdef int _htmlTagValidOrRaise(tag_utf) except -1:
    """Raise ValueError if 'tag_utf' (UTF-8 encoded bytes) does not pass
    _pyHtmlNameIsValid().
    """
    if not _pyHtmlNameIsValid(tag_utf):
        raise ValueError(f"Invalid HTML tag name {(<bytes>tag_utf).decode('utf8')!r}")
    return 0

cdef int _attributeValidOrRaise(name_utf) except -1:
    """Raise ValueError if 'name_utf' (UTF-8 encoded bytes) does not pass
    _pyXmlNameIsValid().
    """
    if not _pyXmlNameIsValid(name_utf):
        raise ValueError(f"Invalid attribute name {(<bytes>name_utf).decode('utf8')!r}")
    return 0

cdef int _prefixValidOrRaise(tag_utf) except -1:
    """Raise ValueError if the namespace prefix 'tag_utf' (UTF-8 encoded
    bytes) does not pass _pyXmlNameIsValid().
    """
    if not _pyXmlNameIsValid(tag_utf):
        raise ValueError(f"Invalid namespace prefix {(<bytes>tag_utf).decode('utf8')!r}")
    return 0

cdef int _uriValidOrRaise(uri_utf) except -1:
    """Raise ValueError if 'uri_utf' (UTF-8 encoded bytes) cannot be parsed
    by xmlParseURI().
    """
    cdef uri.xmlURI* c_uri = uri.xmlParseURI(_cstr(uri_utf))
    if c_uri is NULL:
        raise ValueError(f"Invalid namespace URI {(<bytes>uri_utf).decode('utf8')!r}")
    # the parse result is only needed as a validity check => free it again
    uri.xmlFreeURI(c_uri)
    return 0

cdef inline object _namespacedName(xmlNode* c_node):
    # build the '{namespace}name' string for a node
    return _namespacedNameFromNsName(_getNs(c_node), c_node.name)

cdef object _namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name):
    """Build the tag name string '{href}name', or just 'name' if href is NULL.

    The result is a unicode string if python.LXML_UNICODE_STRINGS is set or
    if a non-ASCII byte is found, and a bytes object otherwise.  A NULL
    'href' delegates the conversion of 'name' to funicode().
    """
    if href is NULL:
        return funicode(name)
    elif not python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8(name) or isutf8(href)):
        return python.PyUnicode_FromFormat("{%s}%s", href, name)
    else:
        # On PyPy, always build a bytes object first and decode it if needed.
        s = python.PyBytes_FromFormat("{%s}%s", href, name)
        if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8l(s, len(s))):
            return (<bytes>s).decode('utf8')
        else:
            return s

cdef _getFilenameForFile(source):
    u"""Given a Python File or Gzip object, give filename back.

    Returns None if not a file object.

    The lookups are best-effort: any regular exception raised while probing
    the object is ignored and the next strategy is tried.  Only subclasses
    of Exception are swallowed, so that KeyboardInterrupt and SystemExit
    still propagate to the caller.
    """
    # urllib2 provides a geturl() method
    try:
        return source.geturl()
    except Exception:
        pass
    # file instances have a name attribute
    try:
        filename = source.name
        if _isString(filename):
            return os_path_abspath(filename)
    except Exception:
        pass
    # gzip file instances have a filename attribute (before Py3k)
    try:
        filename = source.filename
        if _isString(filename):
            return os_path_abspath(filename)
    except Exception:
        pass
    # can't determine filename
    return None