proxy.pxi
# Proxy functions and low level node allocation stuff

# Proxies represent elements, their reference is stored in the C
# structure of the respective node to avoid multiple instantiation of
# the Python class.

@cython.linetrace(False)
@cython.profile(False)
cdef inline _Element getProxy(xmlNode* c_node):
    u"""Get a proxy for a given node.

    Returns the _Element stored in ``c_node._private``, or None if the
    node pointer is NULL or no proxy is registered on it.
    """
    #print "getProxy for:", <int>c_node
    if c_node is not NULL and c_node._private is not NULL:
        return <_Element>c_node._private
    else:
        return None


@cython.linetrace(False)
@cython.profile(False)
cdef inline bint hasProxy(xmlNode* c_node):
    # True iff a proxy is registered on the node.
    # Note: c_node is dereferenced unconditionally and must not be NULL.
    if c_node._private is NULL:
        return False
    return True


@cython.linetrace(False)
@cython.profile(False)
cdef inline int _registerProxy(_Element proxy, _Document doc,
                               xmlNode* c_node) except -1:
    u"""Register a proxy and type for the node it's proxying for.

    Links both directions: the proxy gets its document and node
    pointer, the node's ``_private`` field is pointed at the proxy.
    """
    #print "registering for:", <int>proxy._c_node
    assert not hasProxy(c_node), u"double registering proxy!"
    proxy._doc = doc
    proxy._c_node = c_node
    c_node._private = <void*>proxy
    return 0


@cython.linetrace(False)
@cython.profile(False)
cdef inline int _unregisterProxy(_Element proxy) except -1:
    u"""Unregister a proxy for the node it's proxying for.

    Only clears the node's ``_private`` back-reference; the proxy's own
    ``_c_node`` / ``_doc`` fields are left untouched.
    """
    cdef xmlNode* c_node = proxy._c_node
    assert c_node._private is <void*>proxy, u"Tried to unregister unknown proxy"
    c_node._private = NULL
    return 0


################################################################################
# temporarily make a node the root node of its document

cdef xmlDoc* _fakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node) except NULL:
    # Same as _plainFakeRootDoc() with with_siblings enabled.
    return _plainFakeRootDoc(c_base_doc, c_node, 1)

cdef xmlDoc* _plainFakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node,
                               bint with_siblings) except NULL:
    # build a temporary document that has the given node as root node
    # note that copy and original must not be modified during its lifetime!!
    # always call _destroyFakeDoc() after use!
    #
    # Returns c_base_doc itself when c_node already is its root element
    # (and, unless with_siblings is set, only if it has no siblings).
    # Otherwise returns a shallow document copy whose root is a shallow
    # copy of c_node that *borrows* the original children.
    cdef xmlNode* c_child
    cdef xmlNode* c_root
    cdef xmlNode* c_new_root
    cdef xmlDoc* c_doc
    if with_siblings or (c_node.prev is NULL and c_node.next is NULL):
        c_root = tree.xmlDocGetRootElement(c_base_doc)
        if c_root is c_node:
            # already the root node, no siblings
            return c_base_doc

    c_doc = _copyDoc(c_base_doc, 0)                     # non recursive!
    c_new_root = tree.xmlDocCopyNode(c_node, c_doc, 2)  # non recursive!
    if c_new_root is NULL:
        # copying failed (out of memory) => don't dereference NULL below
        tree.xmlFreeDoc(c_doc)
        raise MemoryError()
    tree.xmlDocSetRootElement(c_doc, c_new_root)
    _copyParentNamespaces(c_node, c_new_root)

    # borrow the children of the original node
    c_new_root.children = c_node.children
    c_new_root.last = c_node.last
    c_new_root.next = c_new_root.prev = NULL

    # store original node
    c_doc._private = c_node

    # divert parent pointers of children
    c_child = c_new_root.children
    while c_child is not NULL:
        c_child.parent = c_new_root
        c_child = c_child.next

    c_doc.children = c_new_root
    return c_doc

cdef void _destroyFakeDoc(xmlDoc* c_base_doc, xmlDoc* c_doc):
    # delete a temporary document
    # (no-op if c_doc is the base document, i.e. no fake doc was built)
    cdef xmlNode* c_child
    cdef xmlNode* c_parent
    cdef xmlNode* c_root
    if c_doc is c_base_doc:
        return
    c_root = tree.xmlDocGetRootElement(c_doc)

    # restore parent pointers of children
    # (the original node was stored in c_doc._private on creation)
    c_parent = <xmlNode*>c_doc._private
    c_child = c_root.children
    while c_child is not NULL:
        c_child.parent = c_parent
        c_child = c_child.next

    # prevent recursive removal of children
    c_root.children = c_root.last = NULL
    tree.xmlFreeDoc(c_doc)

cdef _Element _fakeDocElementFactory(_Document doc, xmlNode* c_element):
    u"""Special element factory for cases where we need to create a fake
    root document, but still need to instantiate arbitrary nodes from
    it.  If we instantiate the fake root node, things will turn bad
    when it's destroyed.

    Instead, if we are asked to instantiate the fake root node, we
    instantiate the original node instead.
    """
    if c_element.doc is not doc._c_doc:
        if c_element.doc._private is not NULL:
            if c_element is c_element.doc.children:
                # fake root node => map back to the original node
                c_element = <xmlNode*>c_element.doc._private
                #assert c_element.type == tree.XML_ELEMENT_NODE
    return _elementFactory(doc, c_element)

################################################################################
# support for freeing tree elements when proxy objects are destroyed

cdef int attemptDeallocation(xmlNode* c_node):
    u"""Attempt deallocation of c_node (or higher up in tree).

    Returns 1 if a subtree was freed, 0 otherwise.
    """
    cdef xmlNode* c_top
    # could be we actually aren't referring to the tree at all
    if c_node is NULL:
        #print "not freeing, node is NULL"
        return 0
    c_top = getDeallocationTop(c_node)
    if c_top is not NULL:
        #print "freeing:", c_top.name
        _removeText(c_top.next)  # tail
        tree.xmlFreeNode(c_top)
        return 1
    return 0

cdef xmlNode* getDeallocationTop(xmlNode* c_node):
    u"""Return the top of the tree that can be deallocated, or NULL.

    NULL is returned as soon as a proxy is found on the node, on one of
    its ancestors, on the siblings of the top-most ancestor or on any
    of their descendants, or if the node is still part of a document.
    """
    cdef xmlNode* c_sibling
    #print "trying to do deallocating:", c_node.type
    if hasProxy(c_node):
        #print "Not freeing: proxies still exist"
        return NULL
    while c_node.parent is not NULL:
        c_node = c_node.parent
        #print "checking:", c_current.type
        # if we're still attached to the document, don't deallocate
        if c_node.type == tree.XML_DOCUMENT_NODE or \
                c_node.type == tree.XML_HTML_DOCUMENT_NODE:
            #print "not freeing: still in doc"
            return NULL
        if hasProxy(c_node):
            #print "Not freeing: proxies still exist"
            return NULL
    # see whether we have children to deallocate
    if not canDeallocateChildNodes(c_node):
        return NULL
    # see whether we have siblings to deallocate
    # (first walk backwards, then forwards from the top node)
    c_sibling = c_node.prev
    while c_sibling:
        if _isElement(c_sibling):
            if hasProxy(c_sibling) or not canDeallocateChildNodes(c_sibling):
                return NULL
        c_sibling = c_sibling.prev
    c_sibling = c_node.next
    while c_sibling:
        if _isElement(c_sibling):
            if hasProxy(c_sibling) or not canDeallocateChildNodes(c_sibling):
                return NULL
        c_sibling = c_sibling.next
    return c_node

cdef int canDeallocateChildNodes(xmlNode* c_parent):
    # Returns 0 if any element visited by the tree iteration below
    # c_parent still has a proxy, 1 otherwise.
    cdef xmlNode* c_node
    c_node = c_parent.children
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_parent, c_node, 1)
    if hasProxy(c_node):
        return 0
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 1

################################################################################
# fix _Document references and namespaces when a node changes documents

cdef void _copyParentNamespaces(xmlNode* c_from_node, xmlNode* c_to_node) nogil:
    u"""Copy the namespaces of all ancestors of c_from_node to c_to_node.
    """
    cdef xmlNode* c_parent
    cdef xmlNs* c_new_ns
    c_parent = c_from_node.parent
    while c_parent and (tree._isElementOrXInclude(c_parent) or
                        c_parent.type == tree.XML_DOCUMENT_NODE):
        c_new_ns = c_parent.nsDef
        while c_new_ns:
            # libxml2 will check if the prefix is already defined
            tree.xmlNewNs(c_to_node, c_new_ns.href, c_new_ns.prefix)
            c_new_ns = c_new_ns.next
        c_parent = c_parent.parent


# one cache entry: nodes referencing 'old' must be switched to 'new'
ctypedef struct _ns_update_map:
    xmlNs* old
    xmlNs* new


# growable array of namespace mappings
ctypedef struct _nscache:
    _ns_update_map* ns_map
    size_t size   # number of allocated entries
    size_t last   # number of used entries (index of next free slot)


cdef int _growNsCache(_nscache* c_ns_cache) except -1:
    # Allocate 20 entries initially, double the capacity afterwards.
    # On allocation failure, the old array is freed, the pointer reset
    # to NULL and MemoryError is raised.
    cdef _ns_update_map* ns_map_ptr
    if c_ns_cache.size == 0:
        c_ns_cache.size = 20
    else:
        c_ns_cache.size *= 2
    ns_map_ptr = <_ns_update_map*> python.lxml_realloc(
        c_ns_cache.ns_map, c_ns_cache.size, sizeof(_ns_update_map))
    if not ns_map_ptr:
        python.lxml_free(c_ns_cache.ns_map)
        c_ns_cache.ns_map = NULL
        raise MemoryError()
    c_ns_cache.ns_map = ns_map_ptr
    return 0


cdef inline int _appendToNsCache(_nscache* c_ns_cache,
                                 xmlNs* c_old_ns, xmlNs* c_new_ns) except -1:
    # Append the mapping c_old_ns -> c_new_ns, growing the cache if full.
    if c_ns_cache.last >= c_ns_cache.size:
        _growNsCache(c_ns_cache)
    c_ns_cache.ns_map[c_ns_cache.last] = _ns_update_map(old=c_old_ns, new=c_new_ns)
    c_ns_cache.last += 1
    return 0


cdef int _stripRedundantNamespaceDeclarations(xmlNode* c_element, _nscache* c_ns_cache,
                                              xmlNs** c_del_ns_list) except -1:
    u"""Removes namespace declarations from an element that are already
    defined in its parents.  Does not free the xmlNs's, just prepends
    them to the c_del_ns_list.
    """
    cdef xmlNs* c_ns
    cdef xmlNs* c_ns_next
    cdef xmlNs** c_nsdef
    # use a xmlNs** to handle assignments to "c_element.nsDef" correctly
    c_nsdef = &c_element.nsDef
    while c_nsdef[0] is not NULL:
        c_ns = tree.xmlSearchNsByHref(
            c_element.doc, c_element.parent, c_nsdef[0].href)
        if c_ns is NULL:
            # new namespace href => keep and cache the ns declaration
            _appendToNsCache(c_ns_cache, c_nsdef[0], c_nsdef[0])
            c_nsdef = &c_nsdef[0].next
        else:
            # known namespace href => cache mapping and strip old ns
            _appendToNsCache(c_ns_cache, c_nsdef[0], c_ns)
            # cut out c_nsdef.next and prepend it to garbage chain
            c_ns_next = c_nsdef[0].next
            c_nsdef[0].next = c_del_ns_list[0]
            c_del_ns_list[0] = c_nsdef[0]
            c_nsdef[0] = c_ns_next
    return 0


cdef void _cleanUpFromNamespaceAdaptation(xmlNode* c_start_node,
                                          _nscache* c_ns_cache, xmlNs* c_del_ns_list):
    # Try to recover from exceptions with really bad timing.  We were in the middle
    # of ripping out xmlNS-es and likely ran out of memory.  Try to fix up the tree
    # by re-adding the original xmlNs declarations (which might still be used in some
    # places).
    if c_ns_cache.ns_map:
        python.lxml_free(c_ns_cache.ns_map)
    if c_del_ns_list:
        if not c_start_node.nsDef:
            c_start_node.nsDef = c_del_ns_list
        else:
            # append the removed declarations to the end of the nsDef chain
            c_ns = c_start_node.nsDef
            while c_ns.next:
                c_ns = c_ns.next
            c_ns.next = c_del_ns_list


cdef int moveNodeToDocument(_Document doc, xmlDoc* c_source_doc,
                            xmlNode* c_element) except -1:
    u"""Fix the xmlNs pointers of a node and its subtree that were moved.

    Originally copied from libxml2's xmlReconciliateNs().  Expects
    libxml2 doc pointers of node to be correct already, but fixes
    _Document references.

    For each node in the subtree, we do this:

    1) Remove redundant declarations of namespace that are already
       defined in its parents.

    2) Replace namespaces that are *not* defined on the node or its
       parents by the equivalent namespace declarations that *are*
       defined on the node or its parents (possibly using a different
       prefix).  If a namespace is unknown, declare a new one on the
       node.

    3) Reassign the names of tags and attribute from the dict of the
       target document *iff* it is different from the dict used in the
       source subtree.

    4) Set the Document reference to the new Document (if different).
       This is done on backtracking to keep the original Document
       alive as long as possible, until all its elements are updated.

    Note that the namespace declarations are removed from the tree in
    step 1), but freed only after the complete subtree was traversed
    and all occurrences were replaced by tree-internal pointers.
    """
    cdef xmlNode* c_start_node
    cdef xmlNode* c_node
    cdef _Element proxy
    cdef _nscache c_ns_cache = [NULL, 0, 0]
    cdef xmlNs* c_del_ns_list = NULL
    # plain C counter, matches the 'size_t' parameter of fixElementDocument()
    cdef size_t proxy_count = 0

    if not tree._isElementOrXInclude(c_element):
        return 0

    c_start_node = c_element

    tree.BEGIN_FOR_EACH_FROM(c_element, c_element, 1)
    if tree._isElementOrXInclude(c_element):
        if hasProxy(c_element):
            proxy_count += 1

        # 1) cut out namespaces defined here that are already known by
        #    the ancestors
        if c_element.nsDef is not NULL:
            try:
                _stripRedundantNamespaceDeclarations(c_element, &c_ns_cache, &c_del_ns_list)
            except:
                _cleanUpFromNamespaceAdaptation(c_start_node, &c_ns_cache, c_del_ns_list)
                raise

        # 2) make sure the namespaces of an element and its attributes
        #    are declared in this document (i.e. on the node or its parents)
        #    (_fixCNs() does its own cleanup before propagating errors)
        if c_element.ns is not NULL:
            _fixCNs(doc, c_start_node, c_element, &c_ns_cache, c_del_ns_list)

        c_node = <xmlNode*>c_element.properties
        while c_node is not NULL:
            if c_node.ns is not NULL:
                _fixCNs(doc, c_start_node, c_node, &c_ns_cache, c_del_ns_list)
            c_node = c_node.next

    tree.END_FOR_EACH_FROM(c_element)

    # free now unused namespace declarations
    if c_del_ns_list is not NULL:
        tree.xmlFreeNsList(c_del_ns_list)

    # cleanup
    if c_ns_cache.ns_map is not NULL:
        python.lxml_free(c_ns_cache.ns_map)

    # 3) fix the names in the tree if we moved it from a different thread
    if doc._c_doc.dict is not c_source_doc.dict:
        fixThreadDictNames(c_start_node, c_source_doc.dict, doc._c_doc.dict)

    # 4) fix _Document references
    #    (and potentially deallocate the source document)
    if proxy_count > 0:
        if proxy_count == 1 and c_start_node._private is not NULL:
            # fast path: the only proxy may be the one of the start node
            proxy = getProxy(c_start_node)
            if proxy is not None:
                if proxy._doc is not doc:
                    proxy._doc = doc
            else:
                fixElementDocument(c_start_node, doc, proxy_count)
        else:
            fixElementDocument(c_start_node, doc, proxy_count)

    return 0


cdef void _setTreeDoc(xmlNode* c_node, xmlDoc* c_doc):
    """Adaptation of 'xmlSetTreeDoc()' that deep-fixes the document links iteratively.
    It avoids https://gitlab.gnome.org/GNOME/libxml2/issues/42
    """
    tree.BEGIN_FOR_EACH_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        c_attr = <tree.xmlAttr*>c_node.properties
        while c_attr:
            if c_attr.atype == tree.XML_ATTRIBUTE_ID:
                # unregister the ID from the old document (c_node.doc
                # is only updated below)
                tree.xmlRemoveID(c_node.doc, c_attr)
            c_attr.doc = c_doc
            _fixDocChildren(c_attr.children, c_doc)
            c_attr = c_attr.next
    # Set doc link for all nodes, not only elements.
    c_node.doc = c_doc
    tree.END_FOR_EACH_FROM(c_node)


cdef inline void _fixDocChildren(xmlNode* c_child, xmlDoc* c_doc):
    # Recursively set the doc pointer of c_child, its following
    # siblings and all of their descendants.
    while c_child:
        c_child.doc = c_doc
        if c_child.children:
            _fixDocChildren(c_child.children, c_doc)
        c_child = c_child.next


cdef int _fixCNs(_Document doc, xmlNode* c_start_node, xmlNode* c_node,
                 _nscache* c_ns_cache, xmlNs* c_del_ns_list) except -1:
    # Replace c_node.ns by an equivalent namespace that is declared in
    # the target document, using (and feeding) the old->new cache.
    # c_node.ns must not be NULL.  On errors, the namespace adaptation
    # is cleaned up before the exception propagates.
    cdef xmlNs* c_ns = NULL
    cdef xmlNs* c_old_ns = c_node.ns
    cdef bint is_prefixed_attr = (c_node.type == tree.XML_ATTRIBUTE_NODE and c_node.ns.prefix)

    for ns_map in c_ns_cache.ns_map[:c_ns_cache.last]:
        if c_old_ns is ns_map.old:
            if is_prefixed_attr and not ns_map.new.prefix:
                # avoid dropping prefix from attributes
                continue
            c_ns = ns_map.new
            break

    if c_ns:
        c_node.ns = c_ns
    else:
        # not in cache or not acceptable
        # => find a replacement from this document
        try:
            c_ns = doc._findOrBuildNodeNs(
                c_start_node, c_old_ns.href, c_old_ns.prefix,
                c_node.type == tree.XML_ATTRIBUTE_NODE)
            c_node.ns = c_ns
            # Cache the mapping under the *original* namespace pointer so
            # that other nodes referencing it can be resolved from the
            # cache.  (Reading c_node.ns here would store 'new -> new'.)
            _appendToNsCache(c_ns_cache, c_old_ns, c_ns)
        except:
            _cleanUpFromNamespaceAdaptation(c_start_node, c_ns_cache, c_del_ns_list)
            raise
    return 0


cdef void fixElementDocument(xmlNode* c_element, _Document doc,
                             size_t proxy_count):
    # Point the proxies in the subtree of c_element at 'doc'.  Stops
    # early once 'proxy_count' proxies have been visited.
    cdef xmlNode* c_node = c_element
    cdef _Element proxy = None # init-to-None required due to fake-loop below
    tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1)
    if c_node._private is not NULL:
        proxy = getProxy(c_node)
        if proxy is not None:
            if proxy._doc is not doc:
                proxy._doc = doc
            proxy_count -= 1
            if proxy_count == 0:
                return
    tree.END_FOR_EACH_FROM(c_node)


cdef void fixThreadDictNames(xmlNode* c_element,
                             tree.xmlDict* c_src_dict,
                             tree.xmlDict* c_dict) nogil:
    # re-assign the names of tags and attributes
    #
    # this should only be called when the element is based on a
    # different libxml2 tag name dictionary
    if c_element.type == tree.XML_DOCUMENT_NODE or \
            c_element.type == tree.XML_HTML_DOCUMENT_NODE:
        # may define "xml" namespace
        fixThreadDictNsForNode(c_element, c_src_dict, c_dict)
        if c_element.doc.extSubset:
            fixThreadDictNamesForDtd(c_element.doc.extSubset, c_src_dict, c_dict)
        if c_element.doc.intSubset:
            fixThreadDictNamesForDtd(c_element.doc.intSubset, c_src_dict, c_dict)
        c_element = c_element.children
        while c_element is not NULL:
            fixThreadDictNamesForNode(c_element, c_src_dict, c_dict)
            c_element = c_element.next
    elif tree._isElementOrXInclude(c_element):
        fixThreadDictNamesForNode(c_element, c_src_dict, c_dict)


cdef inline void _fixThreadDictPtr(const_xmlChar** c_ptr,
                                   tree.xmlDict* c_src_dict,
                                   tree.xmlDict* c_dict) nogil:
    # If the string at c_ptr[0] is owned by c_src_dict, replace it by
    # the equivalent string from c_dict.  The pointer is left unchanged
    # if the lookup fails.
    c_str = c_ptr[0]
    if c_str and c_src_dict and tree.xmlDictOwns(c_src_dict, c_str):
        # return value can be NULL on memory error, but we don't handle that here
        c_str = tree.xmlDictLookup(c_dict, c_str, -1)
        if c_str:
            c_ptr[0] = c_str


cdef void fixThreadDictNamesForNode(xmlNode* c_element,
                                    tree.xmlDict* c_src_dict,
                                    tree.xmlDict* c_dict) nogil:
    # Walk the subtree of c_element and move all dict-owned names (and
    # dict-owned text content) over to c_dict.
    cdef xmlNode* c_node = c_element
    tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1)
    if c_node.type in (tree.XML_ELEMENT_NODE, tree.XML_XINCLUDE_START):
        fixThreadDictNamesForAttributes(
            c_node.properties, c_src_dict, c_dict)
        fixThreadDictNsForNode(c_node, c_src_dict, c_dict)
        _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
    elif c_node.type == tree.XML_TEXT_NODE:
        # libxml2's SAX2 parser interns some indentation space
        fixThreadDictContentForNode(c_node, c_src_dict, c_dict)
    elif c_node.type == tree.XML_COMMENT_NODE:
        pass  # don't touch c_node.name
    else:
        _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
    tree.END_FOR_EACH_FROM(c_node)


cdef inline void fixThreadDictNamesForAttributes(tree.xmlAttr* c_attr,
                                                 tree.xmlDict* c_src_dict,
                                                 tree.xmlDict* c_dict) nogil:
    # Fix the names of c_attr and all following attributes, as well as
    # the content of their child nodes.
    cdef xmlNode* c_child
    cdef xmlNode* c_node = <xmlNode*>c_attr
    while c_node is not NULL:
        if c_node.type not in (tree.XML_TEXT_NODE, tree.XML_COMMENT_NODE):
            _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
            # libxml2 keeps some (!) attribute values in the dict
            c_child = c_node.children
            while c_child is not NULL:
                fixThreadDictContentForNode(c_child, c_src_dict, c_dict)
                c_child = c_child.next
        c_node = c_node.next


cdef inline void fixThreadDictContentForNode(xmlNode* c_node,
                                             tree.xmlDict* c_src_dict,
                                             tree.xmlDict* c_dict) nogil:
    # Move the text content of c_node over to c_dict if (and only if)
    # it is owned by c_src_dict.
    if c_src_dict is NULL:
        # Nothing can be owned by a missing dict.  xmlDictOwns() reports
        # -1 (i.e. a true value) for a NULL dict, so without this guard,
        # content that is not dict-owned would get replaced (and leaked).
        return
    if c_node.content is not NULL and \
            c_node.content is not <xmlChar*>&c_node.properties:
        if tree.xmlDictOwns(c_src_dict, c_node.content):
            # result can be NULL on memory error, but we don't handle that here
            c_node.content = <xmlChar*>tree.xmlDictLookup(c_dict, c_node.content, -1)


cdef inline void fixThreadDictNsForNode(xmlNode* c_node,
                                        tree.xmlDict* c_src_dict,
                                        tree.xmlDict* c_dict) nogil:
    # Fix href and prefix of all namespaces declared on c_node.
    cdef xmlNs* c_ns = c_node.nsDef
    while c_ns is not NULL:
        _fixThreadDictPtr(&c_ns.href, c_src_dict, c_dict)
        _fixThreadDictPtr(&c_ns.prefix, c_src_dict, c_dict)
        c_ns = c_ns.next


cdef void fixThreadDictNamesForDtd(tree.xmlDtd* c_dtd,
                                   tree.xmlDict* c_src_dict,
                                   tree.xmlDict* c_dict) nogil:
    # Fix the dict-owned strings of the element declarations (including
    # their attribute declarations) and entity declarations of a DTD.
    cdef xmlNode* c_node
    cdef tree.xmlElement* c_element
    cdef tree.xmlAttribute* c_attribute
    cdef tree.xmlEntity* c_entity

    c_node = c_dtd.children
    while c_node:
        if c_node.type == tree.XML_ELEMENT_DECL:
            c_element = <tree.xmlElement*>c_node
            if c_element.content:
                _fixThreadDictPtr(&c_element.content.name, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_element.content.prefix, c_src_dict, c_dict)
            c_attribute = c_element.attributes
            while c_attribute:
                _fixThreadDictPtr(&c_attribute.defaultValue, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_attribute.name, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_attribute.prefix, c_src_dict, c_dict)
                _fixThreadDictPtr(&c_attribute.elem, c_src_dict, c_dict)
                c_attribute = c_attribute.nexth
        elif c_node.type == tree.XML_ENTITY_DECL:
            c_entity = <tree.xmlEntity*>c_node
            _fixThreadDictPtr(&c_entity.name, c_src_dict, c_dict)
            _fixThreadDictPtr(&c_entity.ExternalID, c_src_dict, c_dict)
            _fixThreadDictPtr(&c_entity.SystemID, c_src_dict, c_dict)
            _fixThreadDictPtr(<const_xmlChar**>&c_entity.content, c_src_dict, c_dict)
        c_node = c_node.next


################################################################################
# adopt an xmlDoc from an external libxml2 document source

cdef _Document _adoptForeignDoc(xmlDoc* c_doc, _BaseParser parser=None, bint is_owned=True):
    """Convert and wrap an externally produced xmlDoc for use in lxml.
    Assures that all '_private' pointers are NULL to prevent accidental
    dereference into lxml proxy objects.

    If 'is_owned' is true, the document is taken over (and freed on
    type errors); otherwise, a deep copy is wrapped instead.

    Raises ValueError for a NULL document or a non-XML/HTML document
    type, MemoryError if copying the document fails.
    """
    if c_doc is NULL:
        raise ValueError("Illegal document provided: NULL")
    if c_doc.type not in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE):
        # read the type before the document is (potentially) freed
        doc_type = c_doc.type
        if is_owned:
            tree.xmlFreeDoc(c_doc)
        raise ValueError(f"Illegal document provided: expected XML or HTML, found {doc_type}")

    cdef xmlNode* c_node = <xmlNode*>c_doc

    if is_owned:
        tree.BEGIN_FOR_EACH_FROM(<xmlNode*>c_doc, c_node, 1)
        c_node._private = NULL
        tree.END_FOR_EACH_FROM(c_node)
    else:
        # create a fresh copy that lxml owns
        c_doc = tree.xmlCopyDoc(c_doc, 1)
        if c_doc is NULL:
            raise MemoryError()

    return _documentFactory(c_doc, parser)