parser.pxi
# Parsers for XML and HTML

from lxml.includes cimport xmlparser
from lxml.includes cimport htmlparser


class ParseError(LxmlSyntaxError):
    """Syntax error while parsing an XML document.

    For compatibility with ElementTree 1.3 and later.

    The constructor takes a 1-based ``column`` and stores it 0-based in
    ``self.offset``; the ``position`` property converts back to a
    ``(line, 1-based column)`` tuple.
    """
    def __init__(self, message, code, line, column, filename=None):
        super(_ParseError, self).__init__(message)
        self.lineno, self.offset = (line, column - 1)
        self.code = code
        self.filename = filename

    @property
    def position(self):
        return self.lineno, self.offset + 1

    @position.setter
    def position(self, new_pos):
        self.lineno, column = new_pos
        self.offset = column - 1

# module-private alias used by ParseError.__init__() for its super() call
cdef object _ParseError = ParseError


class XMLSyntaxError(ParseError):
    """Syntax error while parsing an XML document.
    """

cdef class ParserError(LxmlError):
    """Internal lxml parser error.
    """


@cython.final
@cython.internal
cdef class _ParserDictionaryContext:
    # Global parser context to share the string dictionary.
    #
    # This class is a delegate singleton!
    #
    # It creates _ParserDictionaryContext objects for each thread to keep thread state,
    # but those must never be used directly.  Always stick to using the static
    # __GLOBAL_PARSER_CONTEXT as defined below the class.
    #

    cdef tree.xmlDict* _c_dict
    cdef _BaseParser _default_parser
    cdef list _implied_parser_contexts

    def __cinit__(self):
        self._c_dict = NULL
        self._implied_parser_contexts = []

    def __dealloc__(self):
        if self._c_dict is not NULL:
            xmlparser.xmlDictFree(self._c_dict)

    cdef void initMainParserContext(self):
        u"""Put the global context into the thread dictionary of the main
        thread.  To be called once and only in the main thread."""
        thread_dict = python.PyThreadState_GetDict()
        if thread_dict is not NULL:
            (<dict>thread_dict)[u"_ParserDictionaryContext"] = self

    cdef _ParserDictionaryContext _findThreadParserContext(self):
        u"Find (or create) the _ParserDictionaryContext object for the current thread"
        cdef _ParserDictionaryContext context
        thread_dict = python.PyThreadState_GetDict()
        if thread_dict is NULL:
            # no thread state dict available => fall back to the global context
            return self
        d = <dict>thread_dict
        result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
        if result is not NULL:
            return <object>result
        context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
        d[u"_ParserDictionaryContext"] = context
        return context

    cdef void setDefaultParser(self, _BaseParser parser):
        u"Set the default parser for the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._default_parser = parser

    cdef _BaseParser getDefaultParser(self):
        u"Return (or create) the default parser of the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._default_parser is None:
            if self._default_parser is None:
                self._default_parser = __DEFAULT_XML_PARSER._copy()
            if context is not self:
                # other threads get their own copy of the global default parser
                context._default_parser = self._default_parser._copy()
        return context._default_parser

    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
        u"Return the thread-local dict or create a new one if necessary."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._c_dict is NULL:
            # thread dict not yet set up => use default or create a new one
            if default is not NULL:
                context._c_dict = default
                xmlparser.xmlDictReference(default)
                return default
            if self._c_dict is NULL:
                self._c_dict = xmlparser.xmlDictCreate()
            if context is not self:
                context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
        return context._c_dict

    cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
        u"""Make ``c_dict_ref[0]`` point to the dict of the current thread,
        freeing the previously referenced dict (if any) and taking a new
        reference on the thread dict."""
        c_dict = c_dict_ref[0]
        c_thread_dict = self._getThreadDict(c_dict)
        if c_dict is c_thread_dict:
            return
        if c_dict is not NULL:
            xmlparser.xmlDictFree(c_dict)
        c_dict_ref[0] = c_thread_dict
        xmlparser.xmlDictReference(c_thread_dict)

    cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        pctxt.dictNames = 1

    cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)

    cdef void initDocDict(self, xmlDoc* result):
        u"Store dict of last object parsed if no shared dict yet"
        # XXX We also free the result dict here if there already was one.
        # This case should only occur for new documents with empty dicts,
        # otherwise we'd free data that's in use => segfault
        self.initThreadDictRef(&result.dict)

    cdef _ParserContext findImpliedContext(self):
        u"""Return any current implied xml parser context for the current
        thread.  This is used when the resolver functions are called
        with an xmlParserCtxt that was generated from within libxml2
        (i.e. without a _ParserContext) - which happens when parsing
        schema and xinclude external references."""
        cdef _ParserDictionaryContext context
        cdef _ParserContext implied_context

        # see if we have a current implied parser
        context = self._findThreadParserContext()
        if context._implied_parser_contexts:
            implied_context = context._implied_parser_contexts[-1]
            return implied_context
        return None

    cdef void pushImpliedContextFromParser(self, _BaseParser parser):
        u"Push a new implied context object taken from the parser."
        if parser is not None:
            self.pushImpliedContext(parser._getParserContext())
        else:
            self.pushImpliedContext(None)

    cdef void pushImpliedContext(self, _ParserContext parser_context):
        u"Push a new implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.append(parser_context)

    cdef void popImpliedContext(self):
        u"Pop the current implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.pop()

cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
__GLOBAL_PARSER_CONTEXT.initMainParserContext()

############################################################
## support for Python unicode I/O
############################################################

# name of Python Py_UNICODE encoding as known to libxml2
cdef const_char* _PY_UNICODE_ENCODING = NULL

cdef int _setupPythonUnicode() except -1:
    u"""Sets _PY_UNICODE_ENCODING to the internal encoding name of Python unicode
    strings if libxml2 supports reading native Python unicode.  This depends
    on iconv and the local Python installation, so we simply check if we find
    a matching encoding handler.
    """
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* enc
    cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
    cdef const_xmlChar* buffer = <const_xmlChar*>uchars
    # apparently, libxml2 can't detect UTF-16 on some systems
    if (buffer[0] == c'<' and buffer[1] == c'\0' and
            buffer[2] == c't' and buffer[3] == c'\0'):
        enc = "UTF-16LE"
    elif (buffer[0] == c'\0' and buffer[1] == c'<' and
            buffer[2] == c'\0' and buffer[3] == c't'):
        enc = "UTF-16BE"
    else:
        # let libxml2 give it a try
        enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
        if enc is NULL:
            # not my fault, it's YOUR broken system :)
            return 0
    enchandler = tree.xmlFindCharEncodingHandler(enc)
    if enchandler is not NULL:
        global _PY_UNICODE_ENCODING
        # we only needed to know that a handler exists => close it again
        tree.xmlCharEncCloseFunc(enchandler)
        _PY_UNICODE_ENCODING = enc
    return 0

cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
    u"Work around bug in libxml2: find iconv name of encoding on our own."
    cdef tree.xmlCharEncoding enc
    enc = tree.xmlDetectCharEncoding(buffer, size)
    if enc == tree.XML_CHAR_ENCODING_UTF16LE:
        if size >= 4 and (buffer[0] == <const_xmlChar>'\xFF' and
                          buffer[1] == <const_xmlChar>'\xFE' and
                          buffer[2] == 0 and buffer[3] == 0):
            return "UTF-32LE"  # according to BOM
        else:
            return "UTF-16LE"
    elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
        return "UTF-16BE"
    elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
        return "UCS-4LE"
    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
        return "UCS-4BE"
    elif enc == tree.XML_CHAR_ENCODING_NONE:
        return NULL
    else:
        # returns a constant char*, no need to free it
        return tree.xmlGetCharEncodingName(enc)

_setupPythonUnicode()

############################################################
## support for file-like objects
############################################################

@cython.final
@cython.internal
cdef class _FileReaderContext:
    # Adapter that feeds the data of a Python file(-like) object to libxml2.
    #
    # Exceptions raised while reading or closing the file are not propagated
    # through libxml2 but stored in the _ExceptionContext passed at creation.
    cdef object _filelike
    cdef object _encoding
    cdef object _url
    cdef object _bytes
    cdef _ExceptionContext _exc_context
    cdef Py_ssize_t _bytes_read   # read offset into _bytes, -1 after EOF
    cdef char* _c_url
    cdef bint _close_file_after_read

    def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
        self._exc_context = exc_context
        self._filelike = filelike
        self._close_file_after_read = close_file
        self._encoding = encoding
        if url is None:
            self._c_url = NULL
        else:
            url = _encodeFilename(url)
            self._c_url = _cstr(url)
        # keep the (encoded) URL object alive as long as _c_url points into it
        self._url = url
        self._bytes = b''
        self._bytes_read = 0

    cdef _close_file(self):
        u"Close the file object if requested at creation, and drop the reference to it."
        if self._filelike is None or not self._close_file_after_read:
            return
        try:
            close = self._filelike.close
        except AttributeError:
            close = None
        finally:
            self._filelike = None
        if close is not None:
            close()

    cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
        u"""Create a libxml2 input buffer that reads from the file object.

        Returns NULL if libxml2 fails to allocate the buffer.
        """
        cdef stdio.FILE* c_stream
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
        if c_buffer is NULL:
            # allocation failed => nothing to configure, let the caller deal with it
            return NULL
        c_stream = python.PyFile_AsFile(self._filelike)
        if c_stream is NULL:
            # not a plain C-level file => read through the Python object
            c_buffer.readcallback = _readFilelikeParser
            c_buffer.context = <python.PyObject*>self
        else:
            c_buffer.readcallback = _readFileParser
            c_buffer.context = c_stream
        return c_buffer

    cdef xmlparser.xmlParserInput* _createParserInput(
            self, xmlparser.xmlParserCtxt* ctxt):
        u"""Create a libxml2 parser input for the file object.

        Returns NULL if the input buffer could not be allocated.
        """
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = self._createParserInputBuffer()
        if c_buffer is NULL:
            return NULL
        return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)

    cdef tree.xmlDtd* _readDtd(self):
        u"""Parse a DTD from the file object.

        Returns NULL if the input buffer could not be allocated or if
        libxml2 returns no DTD.
        """
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = self._createParserInputBuffer()
        if c_buffer is NULL:
            return NULL
        with nogil:
            return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)

    cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
        u"""Parse an XML or HTML document (depending on ``ctxt.html``) from
        the file object and return the libxml2 result, which may be NULL.

        Never raises: exceptions from closing the file are stored in the
        exception context.
        """
        cdef xmlDoc* result
        cdef char* c_encoding
        cdef stdio.FILE* c_stream
        cdef xmlparser.xmlInputReadCallback c_read_callback
        cdef xmlparser.xmlInputCloseCallback c_close_callback
        cdef void* c_callback_context

        if self._encoding is None:
            c_encoding = NULL
        else:
            c_encoding = _cstr(self._encoding)

        c_stream = python.PyFile_AsFile(self._filelike)
        if c_stream is NULL:
            c_read_callback = _readFilelikeParser
            c_callback_context = <python.PyObject*>self
        else:
            c_read_callback = _readFileParser
            c_callback_context = c_stream

        orig_options = ctxt.options
        with nogil:
            if ctxt.html:
                result = htmlparser.htmlCtxtReadIO(
                    ctxt, c_read_callback, NULL, c_callback_context,
                    self._c_url, c_encoding, options)
                if result is not NULL:
                    if _fixHtmlDictNames(ctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadIO(
                    ctxt, c_read_callback, NULL, c_callback_context,
                    self._c_url, c_encoding, options)
        ctxt.options = orig_options # work around libxml2 problem
        try:
            self._close_file()
        except:
            self._exc_context._store_raised()
        finally:
            return result  # swallow any exceptions

    cdef int copyToBuffer(self, char* c_buffer, int c_requested):
        u"""Copy up to ``c_requested`` bytes of file data into ``c_buffer``.

        Returns the number of bytes copied, 0 once the end of the file was
        reached before, or -1 if an exception occurred (which is then stored
        in the exception context instead of being raised).
        """
        cdef int c_byte_count = 0
        cdef char* c_start
        cdef Py_ssize_t byte_count, remaining
        if self._bytes_read < 0:
            # EOF was already seen in a previous call
            return 0
        try:
            byte_count = python.PyBytes_GET_SIZE(self._bytes)
            remaining = byte_count - self._bytes_read
            while c_requested > remaining:
                # hand out what is left of the current chunk, then read more
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, remaining)
                c_byte_count += remaining
                c_buffer += remaining
                c_requested -= remaining

                self._bytes = self._filelike.read(c_requested)
                if not isinstance(self._bytes, bytes):
                    if isinstance(self._bytes, unicode):
                        if self._encoding is None:
                            self._bytes = (<unicode>self._bytes).encode('utf8')
                        else:
                            self._bytes = python.PyUnicode_AsEncodedString(
                                self._bytes, _cstr(self._encoding), NULL)
                    else:
                        self._close_file()
                        raise TypeError, \
                            u"reading from file-like objects must return byte strings or unicode strings"

                remaining = python.PyBytes_GET_SIZE(self._bytes)
                if remaining == 0:
                    # empty read => EOF
                    self._bytes_read = -1
                    self._close_file()
                    return c_byte_count
                self._bytes_read = 0

            if c_requested > 0:
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, c_requested)
                c_byte_count += c_requested
                self._bytes_read += c_requested
        except:
            c_byte_count = -1
            self._exc_context._store_raised()
            try:
                self._close_file()
            except:
                self._exc_context._store_raised()
        finally:
            return c_byte_count  # swallow any exceptions

cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
    # libxml2 read callback for Python file-like objects
    return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)

cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
    # libxml2 read callback for C-level FILE* streams
    return stdio.fread(c_buffer, 1, c_size, <stdio.FILE*>ctxt)

############################################################
## support for custom document loaders
############################################################

cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) with gil:
    # External entity loader that is registered with libxml2 while parsing.
    # Asks the resolvers of the current parser context for the document and
    # falls back to the original libxml2 loader if they do not provide one.
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # if there is no _ParserContext associated with the xmlParserCtxt
    # passed, check to see if the thread state object has an implied
    # context.
    if c_context._private is not NULL:
        context = <_ResolverContext>c_context._private
    else:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()

    if context is None:
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        if c_url is NULL:
            url = None
        else:
            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
            url = _decodeFilename(<const_xmlChar*>c_url)
        if c_pubid is NULL:
            pubid = None
        else:
            pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                c_input.base = _xcstr(data)
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.cur = c_input.base
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            data = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            data = file_context
        else:
            data = None
            c_input = NULL

        if data is not None:
            # the parser input refers to this object => keep it referenced
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input

cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()


cdef xmlparser.xmlExternalEntityLoader _register_document_loader() nogil:
    # install _local_resolver() as entity loader and return the previous loader
    cdef xmlparser.xmlExternalEntityLoader old = xmlparser.xmlGetExternalEntityLoader()
    xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
    return old

cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) nogil:
    # re-install the entity loader returned by _register_document_loader()
    xmlparser.xmlSetExternalEntityLoader(old)


############################################################
## Parsers
############################################################

@cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
@cython.internal
cdef class _ParserContext(_ResolverContext):
    cdef _ErrorLog _error_log
    cdef _ParserSchemaValidationContext _validator
    cdef xmlparser.xmlParserCtxt* _c_ctxt
    cdef xmlparser.xmlExternalEntityLoader _orig_loader
    cdef python.PyThread_type_lock _lock
    cdef _Document _doc
    cdef bint _collect_ids

    def __cinit__(self):
        self._c_ctxt = NULL
        self._collect_ids = True
        if not config.ENABLE_THREADING:
            self._lock = NULL
        else:
            self._lock = python.PyThread_allocate_lock()
        self._error_log = _ErrorLog()

    def __dealloc__(self):
        if config.ENABLE_THREADING and self._lock is not NULL:
            python.PyThread_free_lock(self._lock)
            self._lock = NULL
        if self._c_ctxt is not NULL:
            if <void*>self._validator is not NULL and self._validator is not None:
                # If the parser was not closed correctly (e.g. interrupted iterparse()),
                # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
                # validator plug might still be in place, which will make xmlFreeParserCtxt()
                # crash when trying to xmlFree() a static SAX handler.
                # Thus, make sure we disconnect the handler interceptor here at the latest.
                self._validator.disconnect()
            xmlparser.xmlFreeParserCtxt(self._c_ctxt)

    cdef _ParserContext _copy(self):
        u"""Return a new context of the same class with copied resolvers and
        (if there is one) a copy of the validator, but without a libxml2
        parser context."""
        cdef _ParserContext context
        context = self.__class__()
        context._collect_ids = self._collect_ids
        if self._validator is not None:
            # there is only a validator if a schema was configured
            context._validator = self._validator.copy()
        _initParserContext(context, self._resolvers._copy(), NULL)
        return context

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
        self._c_ctxt = c_ctxt
        # back reference that the libxml2 callbacks use to find this object
        c_ctxt._private = <void*>self

    cdef void _resetParserContext(self):
        if self._c_ctxt is not NULL:
            if self._c_ctxt.html:
                htmlparser.htmlCtxtReset(self._c_ctxt)
                self._c_ctxt.disableSAX = 0 # work around bug in libxml2
            else:
                xmlparser.xmlClearParserCtxt(self._c_ctxt)
                # work around bug in libxml2 [2.9.10 .. 2.9.14]:
                # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
                self._c_ctxt.nsNr = 0

    cdef int prepare(self, bint set_document_loader=True) except -1:
        u"""Acquire the parser lock (if threading is enabled) and set up
        error logging, document loader and validator for a parser run.

        Raises ParserError if the lock cannot be acquired.
        """
        cdef int result
        if config.ENABLE_THREADING and self._lock is not NULL:
            with nogil:
                result = python.PyThread_acquire_lock(
                    self._lock, python.WAIT_LOCK)
            if result == 0:
                raise ParserError, u"parser locking failed"
        self._error_log.clear()
        self._doc = None
        self._c_ctxt.sax.serror = _receiveParserError
        self._orig_loader = _register_document_loader() if set_document_loader else NULL
        if self._validator is not None:
            self._validator.connect(self._c_ctxt, self._error_log)
        return 0

    cdef int cleanup(self) except -1:
        u"Undo the setup done by prepare() and release the parser lock."
        if self._orig_loader is not NULL:
            _reset_document_loader(self._orig_loader)
        try:
            if self._validator is not None:
                self._validator.disconnect()
            self._resetParserContext()
            self.clear()
            self._doc = None
            self._c_ctxt.sax.serror = NULL
        finally:
            if config.ENABLE_THREADING and self._lock is not NULL:
                python.PyThread_release_lock(self._lock)
        return 0

    cdef object _handleParseResult(self, _BaseParser parser,
                                   xmlDoc* result, filename):
        c_doc = self._handleParseResultDoc(parser, result, filename)
        if self._doc is not None and self._doc._c_doc is c_doc:
            return self._doc
        else:
            return _documentFactory(c_doc, parser)

    cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
                                       xmlDoc* result, filename) except NULL:
        recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
        return _handleParseResult(self, self._c_ctxt, result,
                                  filename, recover,
                                  free_doc=self._doc is None)

cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    _initResolverContext(context, resolvers)
    if c_ctxt is not NULL:
        context._initParserContext(c_ctxt)

cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
    (<_ParserContext>_parser_context._private)._error_log._receive(error)

cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
    # structured error callback that prepare() installs in the SAX handler
    if __DEBUG:
        if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
            _forwardError(NULL, error)
        else:
            _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)

cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
                          _ErrorLog error_log) except -1:
    u"""Raise an exception for a failed parser run.  Always raises.

    Raises IOError for I/O level errors on a named file, otherwise
    XMLSyntaxError, built from the error log if that has entries and
    from the last error of the parser context if not.
    """
    if filename is not None and \
           ctxt.lastError.domain == xmlerror.XML_FROM_IO:
        if isinstance(filename, bytes):
            filename = _decodeFilenameWithLength(
                <bytes>filename, len(<bytes>filename))
        if ctxt.lastError.message is not NULL:
            try:
                message = ctxt.lastError.message.decode('utf-8')
            except UnicodeDecodeError:
                # the filename may be in there => play it safe
                message = ctxt.lastError.message.decode('iso8859-1')
            message = f"Error reading file '{filename}': {message.strip()}"
        else:
            message = f"Error reading '{filename}'"
        raise IOError, message
    elif error_log:
        raise error_log._buildParseException(
            XMLSyntaxError, u"Document is not well formed")
    elif ctxt.lastError.message is not NULL:
        # decode the C string like in the I/O case above, so that we format
        # a text message and not the repr of a byte string
        try:
            message = ctxt.lastError.message.decode('utf-8')
        except UnicodeDecodeError:
            message = ctxt.lastError.message.decode('iso8859-1')
        message = message.strip()
        code = ctxt.lastError.code
        line = ctxt.lastError.line
        column = ctxt.lastError.int2
        if ctxt.lastError.line > 0:
            message = f"line {line}: {message}"
        raise XMLSyntaxError(message, code, line, column, filename)
    else:
        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                             filename)

cdef xmlDoc* _handleParseResult(_ParserContext context,
                                xmlparser.xmlParserCtxt* c_ctxt,
                                xmlDoc* result, filename,
                                bint recover, bint free_doc) except NULL:
    u"""Check the outcome of a parser run and return the resulting document.

    Frees the document (if ``free_doc`` is set) and raises an exception if
    it is not considered well-formed/valid, if an exception was stored in
    the context during parsing, or if there is no document at all.
    """
    cdef bint well_formed
    if result is not NULL:
        __GLOBAL_PARSER_CONTEXT.initDocDict(result)

    if c_ctxt.myDoc is not NULL:
        if c_ctxt.myDoc is not result:
            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
            tree.xmlFreeDoc(c_ctxt.myDoc)
        c_ctxt.myDoc = NULL

    if result is not NULL:
        if (context is not None and context._validator is not None and
                not context._validator.isvalid()):
            well_formed = 0  # actually not 'valid', but anyway ...
        elif (context is not None and
                not c_ctxt.wellFormed and not c_ctxt.html and
                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                [1 for error in context._error_log
                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
            # An encoding error occurred and libxml2 switched from UTF-8
            # input to (undecoded) Latin-1, at some arbitrary point in the
            # document.  Better raise an error than allowing for a broken
            # tree with mixed encodings.
            well_formed = 0
        elif recover or (c_ctxt.wellFormed and
                         c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
            well_formed = 1
        elif not c_ctxt.replaceEntities and not c_ctxt.validate \
                 and context is not None:
            # in this mode, we ignore errors about undefined entities
            for error in context._error_log.filter_from_errors():
                if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                       error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                    well_formed = 0
                    break
            else:
                well_formed = 1
        else:
            well_formed = 0

        if not well_formed:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL

    if context is not None and context._has_raised():
        if result is not NULL:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL
        context._raise_if_stored()

    if result is NULL:
        if context is not None:
            _raiseParseError(c_ctxt, filename, context._error_log)
        else:
            _raiseParseError(c_ctxt, filename, None)
    else:
        if result.URL is NULL and filename is not None:
            result.URL = tree.xmlStrdup(_xcstr(filename))
        if result.encoding is NULL:
            result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")

    if context is not None and context._validator is not None and \
           context._validator._add_default_attributes:
        # we currently need to do this here as libxml2 does not
        # support inserting default attributes during parse-time
        # validation
        context._validator.inject_default_attributes(result)

    return result

cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
    # Move the element and attribute names of the whole document into the
    # dict.  Returns -1 if a dict lookup fails, 0 otherwise.
    cdef xmlNode* c_node
    if c_doc is NULL:
        return 0
    c_node = c_doc.children
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0

cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
                                  xmlNode* c_start_node) nogil:
    """
    Move names to the dict, iterating in document order, starting at
    c_start_node. This is used in incremental parsing after each chunk.
    """
    cdef xmlNode* c_node
    if not c_doc:
        return 0
    if not c_start_node:
        return _fixHtmlDictNames(c_dict, c_doc)
    c_node = c_start_node
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0

cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
                                      xmlNode* c_node) nogil:
    # Replace the name of the node and of each of its attributes by the
    # corresponding dict string, freeing names that were not in the dict.
    # Returns -1 if a dict lookup fails, 0 otherwise.
    cdef xmlNode* c_attr
    c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
    if c_name is NULL:
        return -1
    if c_name is not c_node.name:
        tree.xmlFree(<char*>c_node.name)
        c_node.name = c_name
    c_attr = <xmlNode*>c_node.properties
    while c_attr is not NULL:
        c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
        if c_name is NULL:
            return -1
        if c_name is not c_attr.name:
            tree.xmlFree(<char*>c_attr.name)
            c_attr.name = c_name
        c_attr = c_attr.next
    return 0

@cython.internal
cdef class _BaseParser:
    cdef ElementClassLookup _class_lookup
    cdef _ResolverRegistry _resolvers
    cdef _ParserContext _parser_context
    cdef _ParserContext _push_parser_context
    cdef int _parse_options
    cdef bint _for_html
    cdef bint _remove_comments
    cdef bint _remove_pis
    cdef bint _strip_cdata
    cdef bint _collect_ids
    cdef XMLSchema _schema
    cdef bytes _filename
    cdef readonly object target
    cdef object _default_encoding
    cdef tuple _events_to_collect  # (event_types, tag)

    def __init__(self, int parse_options, bint for_html, XMLSchema schema,
                 remove_comments, remove_pis, strip_cdata, collect_ids,
                 target, encoding):
        cdef tree.xmlCharEncodingHandler* enchandler
        cdef int c_encoding
        if not isinstance(self, (XMLParser, HTMLParser)):
            raise TypeError, u"This class cannot be instantiated"

        self._parse_options = parse_options
        self.target = target
        self._for_html = for_html
        self._remove_comments = remove_comments
        self._remove_pis = remove_pis
        self._strip_cdata = strip_cdata
        self._collect_ids = collect_ids
        self._schema = schema

        self._resolvers = _ResolverRegistry()

        if encoding is None:
            self._default_encoding = None
        else:
            encoding = _utf8(encoding)
            # only check that libxml2 knows the encoding, then close the handler again
            enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
            if enchandler is NULL:
                raise LookupError, f"unknown encoding: '{encoding}'"
            tree.xmlCharEncCloseFunc(enchandler)
            self._default_encoding = encoding

    cdef _setBaseURL(self, base_url):
        self._filename = _encodeFilename(base_url)

    cdef _collectEvents(self, event_types, tag):
        if event_types is None:
            event_types = ()
        else:
            event_types = tuple(set(event_types))
            _buildParseEventFilter(event_types)  # purely for validation
        self._events_to_collect = (event_types, tag)

    cdef _ParserContext _getParserContext(self):
        u"Return the parser context of this parser, creating it on first use."
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._parser_context is None:
            self._parser_context = self._createContext(self.target, None)
            self._parser_context._collect_ids = self._collect_ids
            if self._schema is not None:
                self._parser_context._validator = \
                    self._schema._newSaxValidator(
                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
            pctxt = self._newParserCtxt()
            _initParserContext(self._parser_context, self._resolvers, pctxt)
            self._configureSaxContext(pctxt)
        return self._parser_context

    cdef _ParserContext _getPushParserContext(self):
        u"Return the push parser context of this parser, creating it on first use."
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._push_parser_context is None:
            self._push_parser_context = self._createContext(
                self.target, self._events_to_collect)
            self._push_parser_context._collect_ids = self._collect_ids
            if self._schema is not None:
                self._push_parser_context._validator = \
                    self._schema._newSaxValidator(
                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
            pctxt = self._newPushParserCtxt()
            _initParserContext(
                self._push_parser_context, self._resolvers, pctxt)
            self._configureSaxContext(pctxt)
        return self._push_parser_context

    cdef _ParserContext _createContext(self, target, events_to_collect):
        u"""Create the context object for a parser run: a target parser
        context if there is a target, a SAX parser context if only events
        are collected, and a plain _ParserContext otherwise."""
        cdef _SaxParserContext sax_context
        if target is not None:
            sax_context = _TargetParserContext(self)
            (<_TargetParserContext>sax_context)._setTarget(target)
        elif events_to_collect:
            sax_context = _SaxParserContext(self)
        else:
            # nothing special to configure
            return _ParserContext()
        if events_to_collect:
            events, tag = events_to_collect
            sax_context._setEventFilter(events, tag)
        return sax_context

    @cython.final
    cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
        # disable the SAX callbacks for node types that this parser discards
        if self._remove_comments:
            pctxt.sax.comment = NULL
        if self._remove_pis:
            pctxt.sax.processingInstruction = NULL
        if self._strip_cdata:
            # hard switch-off for CDATA nodes => makes them plain text
            pctxt.sax.cdataBlock = NULL

    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
        if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
            # need to extend SAX1 context to SAX2 to get proper error reports
            if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
                # never modify the static default handler => work on a copy
                sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
                if sax is NULL:
                    raise MemoryError()
                cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                                 sizeof(htmlparser.htmlDefaultSAXHandler))
                c_ctxt.sax = sax
            sax.initialized = xmlparser.XML_SAX2_MAGIC
            sax.serror = _receiveParserError
            sax.startElementNs = NULL
            sax.endElementNs = NULL
            sax._private = NULL
        return 0

    cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
        u"""Create a new libxml2 parser context for XML or HTML.

        Raises MemoryError if libxml2 fails to create it.
        """
        cdef xmlparser.xmlParserCtxt* c_ctxt
        if self._for_html:
            c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
        else:
            c_ctxt = xmlparser.xmlNewParserCtxt()
        if c_ctxt is NULL:
            raise MemoryError
        c_ctxt.sax.startDocument = _initSaxDocument
        return c_ctxt

    cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
        cdef xmlparser.xmlParserCtxt* c_ctxt
        cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
        if self._for_html:
            c_ctxt = htmlparser.htmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
                htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
        else:
            c_ctxt = xmlparser.xmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename)
            if c_ctxt is not NULL:
                xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
        if c_ctxt is NULL:
            raise MemoryError()
        c_ctxt.sax.startDocument = _initSaxDocument
958 return c_ctxt 959 960 @property 961 def error_log(self): 962 """The error log of the last parser run. 963 """ 964 cdef _ParserContext context 965 context = self._getParserContext() 966 return context._error_log.copy() 967 968 @property 969 def resolvers(self): 970 """The custom resolver registry of this parser.""" 971 return self._resolvers 972 973 @property 974 def version(self): 975 """The version of the underlying XML parser.""" 976 return u"libxml2 %d.%d.%d" % LIBXML_VERSION 977 978 def setElementClassLookup(self, ElementClassLookup lookup = None): 979 u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead." 980 self.set_element_class_lookup(lookup) 981 982 def set_element_class_lookup(self, ElementClassLookup lookup = None): 983 u"""set_element_class_lookup(self, lookup = None) 984 985 Set a lookup scheme for element classes generated from this parser. 986 987 Reset it by passing None or nothing. 988 """ 989 self._class_lookup = lookup 990 991 cdef _BaseParser _copy(self): 992 u"Create a new parser with the same configuration." 993 cdef _BaseParser parser 994 parser = self.__class__() 995 parser._parse_options = self._parse_options 996 parser._for_html = self._for_html 997 parser._remove_comments = self._remove_comments 998 parser._remove_pis = self._remove_pis 999 parser._strip_cdata = self._strip_cdata 1000 parser._filename = self._filename 1001 parser._resolvers = self._resolvers 1002 parser.target = self.target 1003 parser._class_lookup = self._class_lookup 1004 parser._default_encoding = self._default_encoding 1005 parser._schema = self._schema 1006 parser._events_to_collect = self._events_to_collect 1007 return parser 1008 1009 def copy(self): 1010 u"""copy(self) 1011 1012 Create a new parser with the same configuration. 
1013 """ 1014 return self._copy() 1015 1016 def makeelement(self, _tag, attrib=None, nsmap=None, **_extra): 1017 u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra) 1018 1019 Creates a new element associated with this parser. 1020 """ 1021 return _makeElement(_tag, NULL, None, self, None, None, 1022 attrib, nsmap, _extra) 1023 1024 # internal parser methods 1025 1026 cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: 1027 u"""Parse unicode document, share dictionary if possible. 1028 """ 1029 cdef _ParserContext context 1030 cdef xmlDoc* result 1031 cdef xmlparser.xmlParserCtxt* pctxt 1032 cdef Py_ssize_t py_buffer_len 1033 cdef int buffer_len, c_kind 1034 cdef const_char* c_text 1035 cdef const_char* c_encoding = _PY_UNICODE_ENCODING 1036 cdef bint is_pep393_string = ( 1037 python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext)) 1038 if is_pep393_string: 1039 c_text = <const_char*>python.PyUnicode_DATA(utext) 1040 py_buffer_len = python.PyUnicode_GET_LENGTH(utext) 1041 c_kind = python.PyUnicode_KIND(utext) 1042 if c_kind == 1: 1043 c_encoding = 'ISO-8859-1' 1044 elif c_kind == 2: 1045 py_buffer_len *= 2 1046 if python.PY_BIG_ENDIAN: 1047 c_encoding = 'UTF-16BE' # actually UCS-2 1048 else: 1049 c_encoding = 'UTF-16LE' # actually UCS-2 1050 elif c_kind == 4: 1051 py_buffer_len *= 4 1052 if python.PY_BIG_ENDIAN: 1053 c_encoding = 'UCS-4BE' 1054 else: 1055 c_encoding = 'UCS-4LE' 1056 else: 1057 assert False, f"Illegal Unicode kind {c_kind}" 1058 else: 1059 py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext) 1060 c_text = python.PyUnicode_AS_DATA(utext) 1061 assert 0 <= py_buffer_len <= limits.INT_MAX 1062 buffer_len = py_buffer_len 1063 1064 context = self._getParserContext() 1065 context.prepare() 1066 try: 1067 pctxt = context._c_ctxt 1068 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1069 orig_options = pctxt.options 1070 with nogil: 1071 if self._for_html: 1072 result = htmlparser.htmlCtxtReadMemory( 1073 pctxt, c_text, 
buffer_len, c_filename, c_encoding, 1074 self._parse_options) 1075 if result is not NULL: 1076 if _fixHtmlDictNames(pctxt.dict, result) < 0: 1077 tree.xmlFreeDoc(result) 1078 result = NULL 1079 else: 1080 result = xmlparser.xmlCtxtReadMemory( 1081 pctxt, c_text, buffer_len, c_filename, c_encoding, 1082 self._parse_options) 1083 pctxt.options = orig_options # work around libxml2 problem 1084 1085 return context._handleParseResultDoc(self, result, None) 1086 finally: 1087 context.cleanup() 1088 1089 cdef xmlDoc* _parseDoc(self, char* c_text, int c_len, 1090 char* c_filename) except NULL: 1091 u"""Parse document, share dictionary if possible. 1092 """ 1093 cdef _ParserContext context 1094 cdef xmlDoc* result 1095 cdef xmlparser.xmlParserCtxt* pctxt 1096 cdef char* c_encoding 1097 cdef tree.xmlCharEncoding enc 1098 context = self._getParserContext() 1099 context.prepare() 1100 try: 1101 pctxt = context._c_ctxt 1102 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1103 1104 if self._default_encoding is None: 1105 c_encoding = NULL 1106 # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs 1107 # NOTE: limit to problematic cases because it changes character offsets 1108 if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and 1109 c_text[2] == 0 and c_text[3] == 0): 1110 c_encoding = "UTF-32LE" 1111 c_text += 4 1112 c_len -= 4 1113 elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and 1114 c_text[2] == '\xFE' and c_text[3] == '\xFF'): 1115 c_encoding = "UTF-32BE" 1116 c_text += 4 1117 c_len -= 4 1118 else: 1119 # no BOM => try to determine encoding 1120 enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len) 1121 if enc == tree.XML_CHAR_ENCODING_UCS4LE: 1122 c_encoding = 'UTF-32LE' 1123 elif enc == tree.XML_CHAR_ENCODING_UCS4BE: 1124 c_encoding = 'UTF-32BE' 1125 else: 1126 c_encoding = _cstr(self._default_encoding) 1127 1128 orig_options = pctxt.options 1129 with nogil: 1130 if self._for_html: 1131 result = htmlparser.htmlCtxtReadMemory( 1132 
pctxt, c_text, c_len, c_filename, 1133 c_encoding, self._parse_options) 1134 if result is not NULL: 1135 if _fixHtmlDictNames(pctxt.dict, result) < 0: 1136 tree.xmlFreeDoc(result) 1137 result = NULL 1138 else: 1139 result = xmlparser.xmlCtxtReadMemory( 1140 pctxt, c_text, c_len, c_filename, 1141 c_encoding, self._parse_options) 1142 pctxt.options = orig_options # work around libxml2 problem 1143 1144 return context._handleParseResultDoc(self, result, None) 1145 finally: 1146 context.cleanup() 1147 1148 cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: 1149 cdef _ParserContext context 1150 cdef xmlDoc* result 1151 cdef xmlparser.xmlParserCtxt* pctxt 1152 cdef char* c_encoding 1153 result = NULL 1154 1155 context = self._getParserContext() 1156 context.prepare() 1157 try: 1158 pctxt = context._c_ctxt 1159 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1160 1161 if self._default_encoding is None: 1162 c_encoding = NULL 1163 else: 1164 c_encoding = _cstr(self._default_encoding) 1165 1166 orig_options = pctxt.options 1167 with nogil: 1168 if self._for_html: 1169 result = htmlparser.htmlCtxtReadFile( 1170 pctxt, c_filename, c_encoding, self._parse_options) 1171 if result is not NULL: 1172 if _fixHtmlDictNames(pctxt.dict, result) < 0: 1173 tree.xmlFreeDoc(result) 1174 result = NULL 1175 else: 1176 result = xmlparser.xmlCtxtReadFile( 1177 pctxt, c_filename, c_encoding, self._parse_options) 1178 pctxt.options = orig_options # work around libxml2 problem 1179 1180 return context._handleParseResultDoc(self, result, c_filename) 1181 finally: 1182 context.cleanup() 1183 1184 cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename, 1185 encoding) except NULL: 1186 cdef _ParserContext context 1187 cdef _FileReaderContext file_context 1188 cdef xmlDoc* result 1189 cdef xmlparser.xmlParserCtxt* pctxt 1190 cdef char* c_filename 1191 if not filename: 1192 filename = None 1193 1194 context = self._getParserContext() 1195 context.prepare() 1196 try: 1197 pctxt = 
context._c_ctxt 1198 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1199 file_context = _FileReaderContext( 1200 filelike, context, filename, 1201 encoding or self._default_encoding) 1202 result = file_context._readDoc(pctxt, self._parse_options) 1203 1204 return context._handleParseResultDoc( 1205 self, result, filename) 1206 finally: 1207 context.cleanup() 1208 1209 1210 cdef void _initSaxDocument(void* ctxt) with gil: 1211 xmlparser.xmlSAX2StartDocument(ctxt) 1212 c_ctxt = <xmlparser.xmlParserCtxt*>ctxt 1213 c_doc = c_ctxt.myDoc 1214 1215 # set up document dict 1216 if c_doc and c_ctxt.dict and not c_doc.dict: 1217 # I have no idea why libxml2 disables this - we need it 1218 c_ctxt.dictNames = 1 1219 c_doc.dict = c_ctxt.dict 1220 xmlparser.xmlDictReference(c_ctxt.dict) 1221 1222 # set up XML ID hash table 1223 if c_ctxt._private: 1224 context = <_ParserContext>c_ctxt._private 1225 if context._collect_ids: 1226 # keep the global parser dict from filling up with XML IDs 1227 if c_doc and not c_doc.ids: 1228 # memory errors are not fatal here 1229 c_dict = xmlparser.xmlDictCreate() 1230 if c_dict: 1231 c_doc.ids = tree.xmlHashCreateDict(0, c_dict) 1232 xmlparser.xmlDictFree(c_dict) 1233 else: 1234 c_doc.ids = tree.xmlHashCreate(0) 1235 else: 1236 c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS 1237 if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids): 1238 # already initialised but empty => clear 1239 tree.xmlHashFree(c_doc.ids, NULL) 1240 c_doc.ids = NULL 1241 1242 1243 ############################################################ 1244 ## ET feed parser 1245 ############################################################ 1246 1247 cdef class _FeedParser(_BaseParser): 1248 cdef bint _feed_parser_running 1249 1250 @property 1251 def feed_error_log(self): 1252 """The error log of the last (or current) run of the feed parser. 1253 1254 Note that this is local to the feed parser and thus is 1255 different from what the ``error_log`` property returns. 
1256 """ 1257 return self._getPushParserContext()._error_log.copy() 1258 1259 cpdef feed(self, data): 1260 u"""feed(self, data) 1261 1262 Feeds data to the parser. The argument should be an 8-bit string 1263 buffer containing encoded data, although Unicode is supported as long 1264 as both string types are not mixed. 1265 1266 This is the main entry point to the consumer interface of a 1267 parser. The parser will parse as much of the XML stream as it 1268 can on each call. To finish parsing or to reset the parser, 1269 call the ``close()`` method. Both methods may raise 1270 ParseError if errors occur in the input data. If an error is 1271 raised, there is no longer a need to call ``close()``. 1272 1273 The feed parser interface is independent of the normal parser 1274 usage. You can use the same parser as a feed parser and in 1275 the ``parse()`` function concurrently. 1276 """ 1277 cdef _ParserContext context 1278 cdef bytes bstring 1279 cdef xmlparser.xmlParserCtxt* pctxt 1280 cdef Py_ssize_t py_buffer_len, ustart 1281 cdef const_char* char_data 1282 cdef const_char* c_encoding 1283 cdef int buffer_len 1284 cdef int error 1285 cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER 1286 1287 if isinstance(data, bytes): 1288 if self._default_encoding is None: 1289 c_encoding = NULL 1290 else: 1291 c_encoding = self._default_encoding 1292 char_data = _cstr(data) 1293 py_buffer_len = python.PyBytes_GET_SIZE(data) 1294 ustart = 0 1295 elif isinstance(data, unicode): 1296 c_encoding = b"UTF-8" 1297 char_data = NULL 1298 py_buffer_len = len(<unicode> data) 1299 ustart = 0 1300 else: 1301 raise TypeError, u"Parsing requires string data" 1302 1303 context = self._getPushParserContext() 1304 pctxt = context._c_ctxt 1305 error = 0 1306 if not self._feed_parser_running: 1307 context.prepare(set_document_loader=False) 1308 self._feed_parser_running = 1 1309 c_filename = (_cstr(self._filename) 1310 if self._filename is not None else NULL) 1311 1312 # We have 
to give *mlCtxtResetPush() enough input to figure 1313 # out the character encoding (at least four bytes), 1314 # however if we give it all we got, we'll have nothing for 1315 # *mlParseChunk() and things go wrong. 1316 buffer_len = 0 1317 if char_data is not NULL: 1318 buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len 1319 orig_loader = _register_document_loader() 1320 if self._for_html: 1321 error = _htmlCtxtResetPush( 1322 pctxt, char_data, buffer_len, c_filename, c_encoding, 1323 self._parse_options) 1324 else: 1325 xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) 1326 error = xmlparser.xmlCtxtResetPush( 1327 pctxt, char_data, buffer_len, c_filename, c_encoding) 1328 _reset_document_loader(orig_loader) 1329 py_buffer_len -= buffer_len 1330 char_data += buffer_len 1331 if error: 1332 raise MemoryError() 1333 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) 1334 1335 #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding 1336 1337 fixup_error = 0 1338 while py_buffer_len > 0 and (error == 0 or recover): 1339 if char_data is NULL: 1340 # Unicode parsing by converting chunks to UTF-8 1341 buffer_len = 2**19 # len(bytes) <= 4 * (2**19) == 2 MiB 1342 bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8') 1343 ustart += buffer_len 1344 py_buffer_len -= buffer_len # may end up < 0 1345 error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring)) 1346 else: 1347 # Direct byte string parsing. 
1348 buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX 1349 error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len) 1350 py_buffer_len -= buffer_len 1351 char_data += buffer_len 1352 1353 if fixup_error: 1354 context.store_exception(MemoryError()) 1355 1356 if context._has_raised(): 1357 # propagate Python exceptions immediately 1358 recover = 0 1359 error = 1 1360 break 1361 1362 if error and not pctxt.replaceEntities and not pctxt.validate: 1363 # in this mode, we ignore errors about undefined entities 1364 for entry in context._error_log.filter_from_errors(): 1365 if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ 1366 entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY: 1367 break 1368 else: 1369 error = 0 1370 1371 if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised(): 1372 # propagate Python exceptions immediately 1373 recover = 0 1374 error = 1 1375 1376 if fixup_error or not recover and (error or not pctxt.wellFormed): 1377 self._feed_parser_running = 0 1378 try: 1379 context._handleParseResult(self, pctxt.myDoc, None) 1380 finally: 1381 context.cleanup() 1382 1383 cpdef close(self): 1384 u"""close(self) 1385 1386 Terminates feeding data to this parser. This tells the parser to 1387 process any remaining data in the feed buffer, and then returns the 1388 root Element of the tree that was parsed. 1389 1390 This method must be called after passing the last chunk of data into 1391 the ``feed()`` method. It should only be called when using the feed 1392 parser interface, all other usage is undefined. 
1393 """ 1394 if not self._feed_parser_running: 1395 raise XMLSyntaxError(u"no element found", 1396 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0, 1397 self._filename) 1398 1399 context = self._getPushParserContext() 1400 pctxt = context._c_ctxt 1401 1402 self._feed_parser_running = 0 1403 if self._for_html: 1404 htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) 1405 else: 1406 xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) 1407 1408 if (pctxt.recovery and not pctxt.disableSAX and 1409 isinstance(context, _SaxParserContext)): 1410 # apply any left-over 'end' events 1411 (<_SaxParserContext>context).flushEvents() 1412 1413 try: 1414 result = context._handleParseResult(self, pctxt.myDoc, None) 1415 finally: 1416 context.cleanup() 1417 1418 if isinstance(result, _Document): 1419 return (<_Document>result).getroot() 1420 else: 1421 return result 1422 1423 1424 cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt, 1425 const char* char_data, int buffer_len): 1426 fixup_error = 0 1427 with nogil: 1428 if c_ctxt.html: 1429 c_node = c_ctxt.node # last node where the parser stopped 1430 orig_loader = _register_document_loader() 1431 error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0) 1432 _reset_document_loader(orig_loader) 1433 # and now for the fun part: move node names to the dict 1434 if c_ctxt.myDoc: 1435 fixup_error = _fixHtmlDictSubtreeNames( 1436 c_ctxt.dict, c_ctxt.myDoc, c_node) 1437 if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict: 1438 xmlparser.xmlDictFree(c_ctxt.myDoc.dict) 1439 c_ctxt.myDoc.dict = c_ctxt.dict 1440 xmlparser.xmlDictReference(c_ctxt.dict) 1441 else: 1442 orig_loader = _register_document_loader() 1443 error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0) 1444 _reset_document_loader(orig_loader) 1445 return (error, fixup_error) 1446 1447 1448 cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt, 1449 const_char* c_data, int buffer_len, 1450 const_char* c_filename, const_char* c_encoding, 1451 int 
parse_options) except -1: 1452 cdef xmlparser.xmlParserInput* c_input_stream 1453 # libxml2 lacks an HTML push parser setup function 1454 error = xmlparser.xmlCtxtResetPush( 1455 c_ctxt, c_data, buffer_len, c_filename, c_encoding) 1456 if error: 1457 return error 1458 1459 # fix libxml2 setup for HTML 1460 c_ctxt.progressive = 1 1461 c_ctxt.html = 1 1462 htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options) 1463 1464 return 0 1465 1466 1467 ############################################################ 1468 ## XML parser 1469 ############################################################ 1470 1471 cdef int _XML_DEFAULT_PARSE_OPTIONS 1472 _XML_DEFAULT_PARSE_OPTIONS = ( 1473 xmlparser.XML_PARSE_NOENT | 1474 xmlparser.XML_PARSE_NOCDATA | 1475 xmlparser.XML_PARSE_NONET | 1476 xmlparser.XML_PARSE_COMPACT | 1477 xmlparser.XML_PARSE_BIG_LINES 1478 ) 1479 1480 cdef class XMLParser(_FeedParser): 1481 u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True) 1482 1483 The XML parser. 1484 1485 Parsers can be supplied as additional argument to various parse 1486 functions of the lxml API. A default parser is always available 1487 and can be replaced by a call to the global function 1488 'set_default_parser'. New parsers can be created at any time 1489 without a major run-time overhead. 1490 1491 The keyword arguments in the constructor are mainly based on the 1492 libxml2 parser configuration. A DTD will also be loaded if DTD 1493 validation or attribute default values are requested (unless you 1494 additionally provide an XMLSchema from which the default 1495 attributes can be read). 
1496 1497 Available boolean keyword arguments: 1498 1499 - attribute_defaults - inject default attributes from DTD or XMLSchema 1500 - dtd_validation - validate against a DTD referenced by the document 1501 - load_dtd - use DTD for parsing 1502 - no_network - prevent network access for related files (default: True) 1503 - ns_clean - clean up redundant namespace declarations 1504 - recover - try hard to parse through broken XML 1505 - remove_blank_text - discard blank text nodes that appear ignorable 1506 - remove_comments - discard comments 1507 - remove_pis - discard processing instructions 1508 - strip_cdata - replace CDATA sections by normal text content (default: True) 1509 - compact - save memory for short text content (default: True) 1510 - collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation) 1511 - resolve_entities - replace entities by their text value (default: True) 1512 - huge_tree - disable security restrictions and support very deep trees 1513 and very long text content (only affects libxml2 2.7+) 1514 1515 Other keyword arguments: 1516 1517 - encoding - override the document encoding 1518 - target - a parser target object that will receive the parse events 1519 - schema - an XMLSchema to validate against 1520 1521 Note that you should avoid sharing parsers between threads. While this is 1522 not harmful, it is more efficient to use separate parsers. This does not 1523 apply to the default parser. 
1524 """ 1525 def __init__(self, *, encoding=None, attribute_defaults=False, 1526 dtd_validation=False, load_dtd=False, no_network=True, 1527 ns_clean=False, recover=False, XMLSchema schema=None, 1528 huge_tree=False, remove_blank_text=False, resolve_entities=True, 1529 remove_comments=False, remove_pis=False, strip_cdata=True, 1530 collect_ids=True, target=None, compact=True): 1531 cdef int parse_options 1532 parse_options = _XML_DEFAULT_PARSE_OPTIONS 1533 if load_dtd: 1534 parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD 1535 if dtd_validation: 1536 parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \ 1537 xmlparser.XML_PARSE_DTDLOAD 1538 if attribute_defaults: 1539 parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR 1540 if schema is None: 1541 parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD 1542 if ns_clean: 1543 parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN 1544 if recover: 1545 parse_options = parse_options | xmlparser.XML_PARSE_RECOVER 1546 if remove_blank_text: 1547 parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS 1548 if huge_tree: 1549 parse_options = parse_options | xmlparser.XML_PARSE_HUGE 1550 if not no_network: 1551 parse_options = parse_options ^ xmlparser.XML_PARSE_NONET 1552 if not compact: 1553 parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT 1554 if not resolve_entities: 1555 parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT 1556 if not strip_cdata: 1557 parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA 1558 1559 _BaseParser.__init__(self, parse_options, 0, schema, 1560 remove_comments, remove_pis, strip_cdata, 1561 collect_ids, target, encoding) 1562 1563 1564 cdef class XMLPullParser(XMLParser): 1565 """XMLPullParser(self, events=None, *, tag=None, **kwargs) 1566 1567 XML parser that collects parse events in an iterator. 
1568 1569 The collected events are the same as for iterparse(), but the 1570 parser itself is non-blocking in the sense that it receives 1571 data chunks incrementally through its .feed() method, instead 1572 of reading them directly from a file(-like) object all by itself. 1573 1574 By default, it collects Element end events. To change that, 1575 pass any subset of the available events into the ``events`` 1576 argument: ``'start'``, ``'end'``, ``'start-ns'``, 1577 ``'end-ns'``, ``'comment'``, ``'pi'``. 1578 1579 To support loading external dependencies relative to the input 1580 source, you can pass the ``base_url``. 1581 """ 1582 def __init__(self, events=None, *, tag=None, base_url=None, **kwargs): 1583 XMLParser.__init__(self, **kwargs) 1584 if events is None: 1585 events = ('end',) 1586 self._setBaseURL(base_url) 1587 self._collectEvents(events, tag) 1588 1589 def read_events(self): 1590 return (<_SaxParserContext?>self._getPushParserContext()).events_iterator 1591 1592 1593 cdef class ETCompatXMLParser(XMLParser): 1594 u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \ 1595 dtd_validation=False, load_dtd=False, no_network=True, \ 1596 ns_clean=False, recover=False, schema=None, \ 1597 huge_tree=False, remove_blank_text=False, resolve_entities=True, \ 1598 remove_comments=True, remove_pis=True, strip_cdata=True, \ 1599 target=None, compact=True) 1600 1601 An XML parser with an ElementTree compatible default setup. 1602 1603 See the XMLParser class for details. 1604 1605 This parser has ``remove_comments`` and ``remove_pis`` enabled by default 1606 and thus ignores comments and processing instructions. 
1607 """ 1608 def __init__(self, *, encoding=None, attribute_defaults=False, 1609 dtd_validation=False, load_dtd=False, no_network=True, 1610 ns_clean=False, recover=False, schema=None, 1611 huge_tree=False, remove_blank_text=False, resolve_entities=True, 1612 remove_comments=True, remove_pis=True, strip_cdata=True, 1613 target=None, compact=True): 1614 XMLParser.__init__(self, 1615 attribute_defaults=attribute_defaults, 1616 dtd_validation=dtd_validation, 1617 load_dtd=load_dtd, 1618 no_network=no_network, 1619 ns_clean=ns_clean, 1620 recover=recover, 1621 remove_blank_text=remove_blank_text, 1622 huge_tree=huge_tree, 1623 compact=compact, 1624 resolve_entities=resolve_entities, 1625 remove_comments=remove_comments, 1626 remove_pis=remove_pis, 1627 strip_cdata=strip_cdata, 1628 target=target, 1629 encoding=encoding, 1630 schema=schema) 1631 1632 # ET 1.2 compatible name 1633 XMLTreeBuilder = ETCompatXMLParser 1634 1635 1636 cdef XMLParser __DEFAULT_XML_PARSER 1637 __DEFAULT_XML_PARSER = XMLParser() 1638 1639 __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) 1640 1641 def set_default_parser(_BaseParser parser=None): 1642 u"""set_default_parser(parser=None) 1643 1644 Set a default parser for the current thread. This parser is used 1645 globally whenever no parser is supplied to the various parse functions of 1646 the lxml API. If this function is called without a parser (or if it is 1647 None), the default parser is reset to the original configuration. 1648 1649 Note that the pre-installed default parser is not thread-safe. Avoid the 1650 default parser in multi-threaded environments. You can create a separate 1651 parser for each thread explicitly or use a parser pool. 
1652 """ 1653 if parser is None: 1654 parser = __DEFAULT_XML_PARSER 1655 __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser) 1656 1657 def get_default_parser(): 1658 u"get_default_parser()" 1659 return __GLOBAL_PARSER_CONTEXT.getDefaultParser() 1660 1661 ############################################################ 1662 ## HTML parser 1663 ############################################################ 1664 1665 cdef int _HTML_DEFAULT_PARSE_OPTIONS 1666 _HTML_DEFAULT_PARSE_OPTIONS = ( 1667 htmlparser.HTML_PARSE_RECOVER | 1668 htmlparser.HTML_PARSE_NONET | 1669 htmlparser.HTML_PARSE_COMPACT 1670 ) 1671 1672 cdef class HTMLParser(_FeedParser): 1673 u"""HTMLParser(self, encoding=None, remove_blank_text=False, \ 1674 remove_comments=False, remove_pis=False, strip_cdata=True, \ 1675 no_network=True, target=None, schema: XMLSchema =None, \ 1676 recover=True, compact=True, collect_ids=True, huge_tree=False) 1677 1678 The HTML parser. 1679 1680 This parser allows reading HTML into a normal XML tree. By 1681 default, it can read broken (non well-formed) HTML, depending on 1682 the capabilities of libxml2. Use the 'recover' option to switch 1683 this off. 1684 1685 Available boolean keyword arguments: 1686 1687 - recover - try hard to parse through broken HTML (default: True) 1688 - no_network - prevent network access for related files (default: True) 1689 - remove_blank_text - discard empty text nodes that are ignorable (i.e. 
not actual text content) 1690 - remove_comments - discard comments 1691 - remove_pis - discard processing instructions 1692 - strip_cdata - replace CDATA sections by normal text content (default: True) 1693 - compact - save memory for short text content (default: True) 1694 - default_doctype - add a default doctype even if it is not found in the HTML (default: True) 1695 - collect_ids - use a hash table of XML IDs for fast access (default: True) 1696 - huge_tree - disable security restrictions and support very deep trees 1697 and very long text content (only affects libxml2 2.7+) 1698 1699 Other keyword arguments: 1700 1701 - encoding - override the document encoding 1702 - target - a parser target object that will receive the parse events 1703 - schema - an XMLSchema to validate against 1704 1705 Note that you should avoid sharing parsers between threads for performance 1706 reasons. 1707 """ 1708 def __init__(self, *, encoding=None, remove_blank_text=False, 1709 remove_comments=False, remove_pis=False, strip_cdata=True, 1710 no_network=True, target=None, XMLSchema schema=None, 1711 recover=True, compact=True, default_doctype=True, 1712 collect_ids=True, huge_tree=False): 1713 cdef int parse_options 1714 parse_options = _HTML_DEFAULT_PARSE_OPTIONS 1715 if remove_blank_text: 1716 parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS 1717 if not recover: 1718 parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER 1719 if not no_network: 1720 parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET 1721 if not compact: 1722 parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT 1723 if not default_doctype: 1724 parse_options = parse_options ^ htmlparser.HTML_PARSE_NODEFDTD 1725 if huge_tree: 1726 parse_options = parse_options | xmlparser.XML_PARSE_HUGE 1727 1728 _BaseParser.__init__(self, parse_options, 1, schema, 1729 remove_comments, remove_pis, strip_cdata, 1730 collect_ids, target, encoding) 1731 1732 1733 cdef HTMLParser 
__DEFAULT_HTML_PARSER  # NOTE(review): tail of a declaration that starts before this chunk - confirm
__DEFAULT_HTML_PARSER = HTMLParser()


cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        if events is None:
            # default documented above: only Element end events
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        """Return the ``events_iterator`` of this parser's push parser context."""
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator


############################################################
## helper functions for document creation
############################################################

cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    u"""Parse an in-memory unicode or bytes string into a new xmlDoc.

    Falls back to the default parser of the current thread when ``parser``
    is None.  ``filename`` may be empty/None, in which case no filename is
    passed on to the parser.
    """
    cdef char* c_filename
    cdef char* c_text
    cdef Py_ssize_t c_len
    cdef bint is_pep393_string
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if not filename:
        c_filename = NULL
    else:
        # keep a reference to the encoded name while c_filename points into it
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)
    if isinstance(text, unicode):
        is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
        # c_len: size of the string's character data
        if is_pep393_string:
            c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
        else:
            c_len = python.PyUnicode_GET_DATA_SIZE(text)
        if c_len > limits.INT_MAX:
            # too large for the in-memory code path => read it like a file
            return (<_BaseParser>parser)._parseDocFromFilelike(
                StringIO(text), filename, None)
        if _PY_UNICODE_ENCODING is NULL and not is_pep393_string:
            # no native unicode encoding available => parse UTF-8 bytes instead
            text = (<unicode>text).encode('utf8')
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, "UTF-8")
        return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
    else:
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            # too large for the in-memory code path => read it like a file
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, None)
        c_text = _cstr(text)
        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)

cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    u"Parse the file named by filename8, using the thread's default parser if none is given."
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))

cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    u"Parse from a file-like object, using the thread's default parser if none is given."
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)

cdef xmlDoc* _newXMLDoc() except NULL:
    u"Create an empty XML document; its encoding is set to UTF-8 if unset."
    cdef xmlDoc* result
    result = tree.xmlNewDoc(NULL)
    if result is NULL:
        raise MemoryError()
    if result.encoding is NULL:
        result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result

cdef xmlDoc* _newHTMLDoc() except NULL:
    u"Create an empty HTML document."
    cdef xmlDoc* result
    result = tree.htmlNewDoc(NULL, NULL)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result

cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    u"Copy the document, including its content if 'recursive' is true."
    cdef xmlDoc* result
    if recursive:
        # only the recursive copy runs without the GIL
        with nogil:
            result = tree.xmlCopyDoc(c_doc, recursive)
    else:
        result = tree.xmlCopyDoc(c_doc, 0)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result

cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"""Recursively copy the document and make c_new_root the new root node.

    Raises MemoryError if either the document or the node copy fails.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0)  # non recursive
    if result is NULL:
        # same check as in _copyDoc(): never hand a NULL document
        # to initDocDict() / xmlDocCopyNode()
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1)  # recursive
    if c_node is NULL:
        # the copied document has no owner yet => free it before raising
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    # NOTE(review): presumably copies the tail text that follows
    # c_new_root - confirm against _copyTail()
    _copyTail(c_new_root.next, c_node)
    return result

cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"Recursively copy the element into the document. c_doc is not modified."
    cdef xmlNode* c_root
    c_root = tree.xmlDocCopyNode(c_node, c_doc, 1)  # recursive
    if c_root is NULL:
        raise MemoryError()
    _copyTail(c_node.next, c_root)
    return c_root


############################################################
## API level helper functions for _Document creation
############################################################

cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    u"""Parse 'source' into a _Document.

    'source' is dispatched in this order: a string (parsed as a file
    path/URL), an object with getvalue() and tell() that is positioned at
    0 (parsed from its in-memory value), or an object with read().
    Anything else raises TypeError.
    """
    cdef _Document doc
    source = _getFSPathOrObject(source)
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        # fix base URL if requested
        if base_url is not None:
            base_url = _encodeFilenameUTF8(base_url)
            if doc._c_doc.URL is not NULL:
                tree.xmlFree(<char*>doc._c_doc.URL)
            doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    # an explicit base_url takes precedence over the file object's own name
    if base_url is not None:
        url = base_url
    else:
        url = _getFilenameForFile(source)

    if hasattr(source, u'getvalue') and hasattr(source, u'tell'):
        # StringIO - reading from start?
        if source.tell() == 0:
            return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, u'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'"

cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    u"Parse the file at 'url' and wrap the result in a _Document."
    c_doc = _parseDocFromFile(url, parser)
    return _documentFactory(c_doc, parser)

cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    u"""Parse a unicode or bytes string and wrap the result in a _Document.

    Raises ValueError for unicode input that carries an encoding
    declaration and for input that is neither unicode nor bytes.
    """
    if isinstance(text, unicode):
        if _hasEncodingDeclaration(text):
            raise ValueError(
                u"Unicode strings with encoding declaration are not supported. "
                u"Please use bytes input or XML fragments without declaration.")
    elif not isinstance(text, bytes):
        raise ValueError, u"can only parse strings"
    c_doc = _parseDoc(text, url, parser)
    return _documentFactory(c_doc, parser)

cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    u"Parse from a file-like object and wrap the result in a _Document."
    c_doc = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_doc, parser)