Cradicle Explorer

/ lib / lxml / parser.pxi
parser.pxi
   1  # Parsers for XML and HTML
   2  
   3  from lxml.includes cimport xmlparser
   4  from lxml.includes cimport htmlparser
   5  
   6  
   7  class ParseError(LxmlSyntaxError):
   8      """Syntax error while parsing an XML document.
   9  
  10      For compatibility with ElementTree 1.3 and later.
  11      """
  12      def __init__(self, message, code, line, column, filename=None):
  13          super(_ParseError, self).__init__(message)
  14          self.lineno, self.offset = (line, column - 1)
  15          self.code = code
  16          self.filename = filename
  17  
  18      @property
  19      def position(self):
  20          return self.lineno, self.offset + 1
  21  
  22      @position.setter
  23      def position(self, new_pos):
  24          self.lineno, column = new_pos
  25          self.offset = column - 1
  26  
# Module-level alias of the ParseError class; ParseError.__init__() passes
# it to super().
cdef object _ParseError = ParseError
  28  
  29  
class XMLSyntaxError(ParseError):
    """Syntax error while parsing an XML document.

    Subclass of ParseError that adds no attributes or behaviour of its own.
    """
  33  
cdef class ParserError(LxmlError):
    """Internal lxml parser error.

    Raised by _ParserContext.prepare() when the parser lock cannot be
    acquired.
    """
  37  
  38  
  39  @cython.final
  40  @cython.internal
  41  cdef class _ParserDictionaryContext:
  42      # Global parser context to share the string dictionary.
  43      #
  44      # This class is a delegate singleton!
  45      #
  46      # It creates _ParserDictionaryContext objects for each thread to keep thread state,
  47      # but those must never be used directly.  Always stick to using the static
  48      # __GLOBAL_PARSER_CONTEXT as defined below the class.
  49      #
  50  
  51      cdef tree.xmlDict* _c_dict
  52      cdef _BaseParser _default_parser
  53      cdef list _implied_parser_contexts
  54  
  55      def __cinit__(self):
  56          self._c_dict = NULL
  57          self._implied_parser_contexts = []
  58  
  59      def __dealloc__(self):
  60          if self._c_dict is not NULL:
  61              xmlparser.xmlDictFree(self._c_dict)
  62  
  63      cdef void initMainParserContext(self):
  64          u"""Put the global context into the thread dictionary of the main
  65          thread.  To be called once and only in the main thread."""
  66          thread_dict = python.PyThreadState_GetDict()
  67          if thread_dict is not NULL:
  68              (<dict>thread_dict)[u"_ParserDictionaryContext"] = self
  69  
  70      cdef _ParserDictionaryContext _findThreadParserContext(self):
  71          u"Find (or create) the _ParserDictionaryContext object for the current thread"
  72          cdef _ParserDictionaryContext context
  73          thread_dict = python.PyThreadState_GetDict()
  74          if thread_dict is NULL:
  75              return self
  76          d = <dict>thread_dict
  77          result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
  78          if result is not NULL:
  79              return <object>result
  80          context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
  81          d[u"_ParserDictionaryContext"] = context
  82          return context
  83  
  84      cdef void setDefaultParser(self, _BaseParser parser):
  85          u"Set the default parser for the current thread"
  86          cdef _ParserDictionaryContext context
  87          context = self._findThreadParserContext()
  88          context._default_parser = parser
  89  
  90      cdef _BaseParser getDefaultParser(self):
  91          u"Return (or create) the default parser of the current thread"
  92          cdef _ParserDictionaryContext context
  93          context = self._findThreadParserContext()
  94          if context._default_parser is None:
  95              if self._default_parser is None:
  96                  self._default_parser = __DEFAULT_XML_PARSER._copy()
  97              if context is not self:
  98                  context._default_parser = self._default_parser._copy()
  99          return context._default_parser
 100  
 101      cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
 102          u"Return the thread-local dict or create a new one if necessary."
 103          cdef _ParserDictionaryContext context
 104          context = self._findThreadParserContext()
 105          if context._c_dict is NULL:
 106              # thread dict not yet set up => use default or create a new one
 107              if default is not NULL:
 108                  context._c_dict = default
 109                  xmlparser.xmlDictReference(default)
 110                  return default
 111              if self._c_dict is NULL:
 112                  self._c_dict = xmlparser.xmlDictCreate()
 113              if context is not self:
 114                  context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
 115          return context._c_dict
 116  
 117      cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
 118          c_dict = c_dict_ref[0]
 119          c_thread_dict = self._getThreadDict(c_dict)
 120          if c_dict is c_thread_dict:
 121              return
 122          if c_dict is not NULL:
 123              xmlparser.xmlDictFree(c_dict)
 124          c_dict_ref[0] = c_thread_dict
 125          xmlparser.xmlDictReference(c_thread_dict)
 126  
 127      cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
 128          u"Assure we always use the same string dictionary."
 129          self.initThreadDictRef(&pctxt.dict)
 130          pctxt.dictNames = 1
 131  
 132      cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
 133          u"Assure we always use the same string dictionary."
 134          self.initThreadDictRef(&pctxt.dict)
 135  
 136      cdef void initDocDict(self, xmlDoc* result):
 137          u"Store dict of last object parsed if no shared dict yet"
 138          # XXX We also free the result dict here if there already was one.
 139          # This case should only occur for new documents with empty dicts,
 140          # otherwise we'd free data that's in use => segfault
 141          self.initThreadDictRef(&result.dict)
 142  
 143      cdef _ParserContext findImpliedContext(self):
 144          u"""Return any current implied xml parser context for the current
 145          thread.  This is used when the resolver functions are called
 146          with an xmlParserCtxt that was generated from within libxml2
 147          (i.e. without a _ParserContext) - which happens when parsing
 148          schema and xinclude external references."""
 149          cdef _ParserDictionaryContext context
 150          cdef _ParserContext implied_context
 151  
 152          # see if we have a current implied parser
 153          context = self._findThreadParserContext()
 154          if context._implied_parser_contexts:
 155              implied_context = context._implied_parser_contexts[-1]
 156              return implied_context
 157          return None
 158  
 159      cdef void pushImpliedContextFromParser(self, _BaseParser parser):
 160          u"Push a new implied context object taken from the parser."
 161          if parser is not None:
 162              self.pushImpliedContext(parser._getParserContext())
 163          else:
 164              self.pushImpliedContext(None)
 165  
 166      cdef void pushImpliedContext(self, _ParserContext parser_context):
 167          u"Push a new implied context object."
 168          cdef _ParserDictionaryContext context
 169          context = self._findThreadParserContext()
 170          context._implied_parser_contexts.append(parser_context)
 171  
 172      cdef void popImpliedContext(self):
 173          u"Pop the current implied context object."
 174          cdef _ParserDictionaryContext context
 175          context = self._findThreadParserContext()
 176          context._implied_parser_contexts.pop()
 177  
# The module-level singleton instance.  initMainParserContext() stores it in
# the thread state dictionary of the thread that executes this line.
cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
 180  
 181  ############################################################
 182  ## support for Python unicode I/O
 183  ############################################################
 184  
# name of Python Py_UNICODE encoding as known to libxml2
# (stays NULL unless _setupPythonUnicode() finds a matching encoding handler)
cdef const_char* _PY_UNICODE_ENCODING = NULL
 187  
 188  cdef int _setupPythonUnicode() except -1:
 189      u"""Sets _PY_UNICODE_ENCODING to the internal encoding name of Python unicode
 190      strings if libxml2 supports reading native Python unicode.  This depends
 191      on iconv and the local Python installation, so we simply check if we find
 192      a matching encoding handler.
 193      """
 194      cdef tree.xmlCharEncodingHandler* enchandler
 195      cdef Py_ssize_t l
 196      cdef const_char* enc
 197      cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
 198      cdef const_xmlChar* buffer = <const_xmlChar*>uchars
 199      # apparently, libxml2 can't detect UTF-16 on some systems
 200      if (buffer[0] == c'<' and buffer[1] == c'\0' and
 201              buffer[2] == c't' and buffer[3] == c'\0'):
 202          enc = "UTF-16LE"
 203      elif (buffer[0] == c'\0' and buffer[1] == c'<' and
 204              buffer[2] == c'\0' and buffer[3] == c't'):
 205          enc = "UTF-16BE"
 206      else:
 207          # let libxml2 give it a try
 208          enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
 209          if enc is NULL:
 210              # not my fault, it's YOUR broken system :)
 211              return 0
 212      enchandler = tree.xmlFindCharEncodingHandler(enc)
 213      if enchandler is not NULL:
 214          global _PY_UNICODE_ENCODING
 215          tree.xmlCharEncCloseFunc(enchandler)
 216          _PY_UNICODE_ENCODING = enc
 217      return 0
 218  
 219  cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
 220      u"Work around bug in libxml2: find iconv name of encoding on our own."
 221      cdef tree.xmlCharEncoding enc
 222      enc = tree.xmlDetectCharEncoding(buffer, size)
 223      if enc == tree.XML_CHAR_ENCODING_UTF16LE:
 224          if size >= 4 and (buffer[0] == <const_xmlChar>'\xFF' and
 225                            buffer[1] == <const_xmlChar>'\xFE' and
 226                            buffer[2] == 0 and buffer[3] == 0):
 227              return "UTF-32LE"  # according to BOM
 228          else:
 229              return "UTF-16LE"
 230      elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
 231          return "UTF-16BE"
 232      elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
 233          return "UCS-4LE"
 234      elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
 235          return "UCS-4BE"
 236      elif enc == tree.XML_CHAR_ENCODING_NONE:
 237          return NULL
 238      else:
 239          # returns a constant char*, no need to free it
 240          return tree.xmlGetCharEncodingName(enc)
 241  
# executed once when the module is initialised; may set _PY_UNICODE_ENCODING
_setupPythonUnicode()
 243  
 244  ############################################################
 245  ## support for file-like objects
 246  ############################################################
 247  
 248  @cython.final
 249  @cython.internal
 250  cdef class _FileReaderContext:
 251      cdef object _filelike
 252      cdef object _encoding
 253      cdef object _url
 254      cdef object _bytes
 255      cdef _ExceptionContext _exc_context
 256      cdef Py_ssize_t _bytes_read
 257      cdef char* _c_url
 258      cdef bint _close_file_after_read
 259  
 260      def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
 261          self._exc_context = exc_context
 262          self._filelike = filelike
 263          self._close_file_after_read = close_file
 264          self._encoding = encoding
 265          if url is None:
 266              self._c_url = NULL
 267          else:
 268              url = _encodeFilename(url)
 269              self._c_url = _cstr(url)
 270          self._url = url
 271          self._bytes  = b''
 272          self._bytes_read = 0
 273  
 274      cdef _close_file(self):
 275          if self._filelike is None or not self._close_file_after_read:
 276              return
 277          try:
 278              close = self._filelike.close
 279          except AttributeError:
 280              close = None
 281          finally:
 282              self._filelike = None
 283          if close is not None:
 284              close()
 285  
 286      cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
 287          cdef stdio.FILE* c_stream
 288          cdef xmlparser.xmlParserInputBuffer* c_buffer
 289          c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
 290          c_stream = python.PyFile_AsFile(self._filelike)
 291          if c_stream is NULL:
 292              c_buffer.readcallback  = _readFilelikeParser
 293              c_buffer.context = <python.PyObject*>self
 294          else:
 295              c_buffer.readcallback  = _readFileParser
 296              c_buffer.context = c_stream
 297          return c_buffer
 298  
 299      cdef xmlparser.xmlParserInput* _createParserInput(
 300              self, xmlparser.xmlParserCtxt* ctxt):
 301          cdef xmlparser.xmlParserInputBuffer* c_buffer
 302          c_buffer = self._createParserInputBuffer()
 303          return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
 304  
 305      cdef tree.xmlDtd* _readDtd(self):
 306          cdef xmlparser.xmlParserInputBuffer* c_buffer
 307          c_buffer = self._createParserInputBuffer()
 308          with nogil:
 309              return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
 310  
 311      cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
 312          cdef xmlDoc* result
 313          cdef char* c_encoding
 314          cdef stdio.FILE* c_stream
 315          cdef xmlparser.xmlInputReadCallback c_read_callback
 316          cdef xmlparser.xmlInputCloseCallback c_close_callback
 317          cdef void* c_callback_context
 318  
 319          if self._encoding is None:
 320              c_encoding = NULL
 321          else:
 322              c_encoding = _cstr(self._encoding)
 323  
 324          c_stream = python.PyFile_AsFile(self._filelike)
 325          if c_stream is NULL:
 326              c_read_callback  = _readFilelikeParser
 327              c_callback_context = <python.PyObject*>self
 328          else:
 329              c_read_callback  = _readFileParser
 330              c_callback_context = c_stream
 331  
 332          orig_options = ctxt.options
 333          with nogil:
 334              if ctxt.html:
 335                  result = htmlparser.htmlCtxtReadIO(
 336                          ctxt, c_read_callback, NULL, c_callback_context,
 337                          self._c_url, c_encoding, options)
 338                  if result is not NULL:
 339                      if _fixHtmlDictNames(ctxt.dict, result) < 0:
 340                          tree.xmlFreeDoc(result)
 341                          result = NULL
 342              else:
 343                  result = xmlparser.xmlCtxtReadIO(
 344                      ctxt, c_read_callback, NULL, c_callback_context,
 345                      self._c_url, c_encoding, options)
 346          ctxt.options = orig_options # work around libxml2 problem
 347          try:
 348              self._close_file()
 349          except:
 350              self._exc_context._store_raised()
 351          finally:
 352              return result  # swallow any exceptions
 353  
 354      cdef int copyToBuffer(self, char* c_buffer, int c_requested):
 355          cdef int c_byte_count = 0
 356          cdef char* c_start
 357          cdef Py_ssize_t byte_count, remaining
 358          if self._bytes_read < 0:
 359              return 0
 360          try:
 361              byte_count = python.PyBytes_GET_SIZE(self._bytes)
 362              remaining  = byte_count - self._bytes_read
 363              while c_requested > remaining:
 364                  c_start = _cstr(self._bytes) + self._bytes_read
 365                  cstring_h.memcpy(c_buffer, c_start, remaining)
 366                  c_byte_count += remaining
 367                  c_buffer += remaining
 368                  c_requested -= remaining
 369  
 370                  self._bytes = self._filelike.read(c_requested)
 371                  if not isinstance(self._bytes, bytes):
 372                      if isinstance(self._bytes, unicode):
 373                          if self._encoding is None:
 374                              self._bytes = (<unicode>self._bytes).encode('utf8')
 375                          else:
 376                              self._bytes = python.PyUnicode_AsEncodedString(
 377                                  self._bytes, _cstr(self._encoding), NULL)
 378                      else:
 379                          self._close_file()
 380                          raise TypeError, \
 381                              u"reading from file-like objects must return byte strings or unicode strings"
 382  
 383                  remaining = python.PyBytes_GET_SIZE(self._bytes)
 384                  if remaining == 0:
 385                      self._bytes_read = -1
 386                      self._close_file()
 387                      return c_byte_count
 388                  self._bytes_read = 0
 389  
 390              if c_requested > 0:
 391                  c_start = _cstr(self._bytes) + self._bytes_read
 392                  cstring_h.memcpy(c_buffer, c_start, c_requested)
 393                  c_byte_count += c_requested
 394                  self._bytes_read += c_requested
 395          except:
 396              c_byte_count = -1
 397              self._exc_context._store_raised()
 398              try:
 399                  self._close_file()
 400              except:
 401                  self._exc_context._store_raised()
 402          finally:
 403              return c_byte_count  # swallow any exceptions
 404  
 405  cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
 406      return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
 407  
 408  cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
 409      return stdio.fread(c_buffer, 1,  c_size, <stdio.FILE*>ctxt)
 410  
 411  ############################################################
 412  ## support for custom document loaders
 413  ############################################################
 414  
# External entity loader callback for libxml2; installed by
# _register_document_loader().  It passes the URL and public ID to the
# Python-level resolvers of the parser context and builds a libxml2 input
# from whatever they return.  If there is no context, no resolver result or
# no input could be created, the loader in __DEFAULT_ENTITY_LOADER is used
# (or NULL is returned if that is NULL).
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) with gil:
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # if there is no _ParserContext associated with the xmlParserCtxt
    # passed, check to see if the thread state object has an implied
    # context.
    if c_context._private is not NULL:
        context = <_ResolverContext>c_context._private
    else:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()

    if context is None:
        # no Python-level context at all => only the default loader can help
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        if c_url is NULL:
            url = None
        else:
            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
            url = _decodeFilename(<const_xmlChar*>c_url)
        if c_pubid is NULL:
            pubid = None
        else:
            pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        # exceptions cannot pass through libxml2 => store them in the context
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                # NOTE(review): the input presumably points straight into the
                # buffer of 'data' (no copy is made here); 'data' is added to
                # context._storage below, presumably to keep it alive - confirm
                c_input.base = _xcstr(data)
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.cur = c_input.base
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            data = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            # the reader object itself is what gets stored in the context below
            data = file_context
        else:
            data = None
            c_input = NULL

        if data is not None:
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    # nothing resolved (or no input created) => fall back to the default loader
    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
 501  
# The external entity loader that libxml2 reports when this module is
# initialised; _local_resolver() falls back to it.
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
 504  
 505  
 506  cdef xmlparser.xmlExternalEntityLoader _register_document_loader() nogil:
 507      cdef xmlparser.xmlExternalEntityLoader old = xmlparser.xmlGetExternalEntityLoader()
 508      xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
 509      return old
 510  
cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) nogil:
    # Re-install 'old' (as returned by _register_document_loader()) as
    # libxml2's external entity loader.
    xmlparser.xmlSetExternalEntityLoader(old)
 513  
 514  
 515  ############################################################
 516  ## Parsers
 517  ############################################################
 518  
 519  @cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
 520  @cython.internal
 521  cdef class _ParserContext(_ResolverContext):
 522      cdef _ErrorLog _error_log
 523      cdef _ParserSchemaValidationContext _validator
 524      cdef xmlparser.xmlParserCtxt* _c_ctxt
 525      cdef xmlparser.xmlExternalEntityLoader _orig_loader
 526      cdef python.PyThread_type_lock _lock
 527      cdef _Document _doc
 528      cdef bint _collect_ids
 529  
 530      def __cinit__(self):
 531          self._c_ctxt = NULL
 532          self._collect_ids = True
 533          if not config.ENABLE_THREADING:
 534              self._lock = NULL
 535          else:
 536              self._lock = python.PyThread_allocate_lock()
 537          self._error_log = _ErrorLog()
 538  
 539      def __dealloc__(self):
 540          if config.ENABLE_THREADING and self._lock is not NULL:
 541              python.PyThread_free_lock(self._lock)
 542              self._lock = NULL
 543          if self._c_ctxt is not NULL:
 544              if <void*>self._validator is not NULL and self._validator is not None:
 545                  # If the parser was not closed correctly (e.g. interrupted iterparse()),
 546                  # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
 547                  # validator plug might still be in place, which will make xmlFreeParserCtxt()
 548                  # crash when trying to xmlFree() a static SAX handler.
 549                  # Thus, make sure we disconnect the handler interceptor here at the latest.
 550                  self._validator.disconnect()
 551              xmlparser.xmlFreeParserCtxt(self._c_ctxt)
 552  
 553      cdef _ParserContext _copy(self):
 554          cdef _ParserContext context
 555          context = self.__class__()
 556          context._collect_ids = self._collect_ids
 557          context._validator = self._validator.copy()
 558          _initParserContext(context, self._resolvers._copy(), NULL)
 559          return context
 560  
 561      cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
 562          self._c_ctxt = c_ctxt
 563          c_ctxt._private = <void*>self
 564  
 565      cdef void _resetParserContext(self):
 566          if self._c_ctxt is not NULL:
 567              if self._c_ctxt.html:
 568                  htmlparser.htmlCtxtReset(self._c_ctxt)
 569                  self._c_ctxt.disableSAX = 0 # work around bug in libxml2
 570              else:
 571                  xmlparser.xmlClearParserCtxt(self._c_ctxt)
 572                  # work around bug in libxml2 [2.9.10 .. 2.9.14]:
 573                  # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
 574                  self._c_ctxt.nsNr = 0
 575  
 576      cdef int prepare(self, bint set_document_loader=True) except -1:
 577          cdef int result
 578          if config.ENABLE_THREADING and self._lock is not NULL:
 579              with nogil:
 580                  result = python.PyThread_acquire_lock(
 581                      self._lock, python.WAIT_LOCK)
 582              if result == 0:
 583                  raise ParserError, u"parser locking failed"
 584          self._error_log.clear()
 585          self._doc = None
 586          self._c_ctxt.sax.serror = _receiveParserError
 587          self._orig_loader = _register_document_loader() if set_document_loader else NULL
 588          if self._validator is not None:
 589              self._validator.connect(self._c_ctxt, self._error_log)
 590          return 0
 591  
 592      cdef int cleanup(self) except -1:
 593          if self._orig_loader is not NULL:
 594              _reset_document_loader(self._orig_loader)
 595          try:
 596              if self._validator is not None:
 597                  self._validator.disconnect()
 598              self._resetParserContext()
 599              self.clear()
 600              self._doc = None
 601              self._c_ctxt.sax.serror = NULL
 602          finally:
 603              if config.ENABLE_THREADING and self._lock is not NULL:
 604                  python.PyThread_release_lock(self._lock)
 605          return 0
 606  
 607      cdef object _handleParseResult(self, _BaseParser parser,
 608                                     xmlDoc* result, filename):
 609          c_doc = self._handleParseResultDoc(parser, result, filename)
 610          if self._doc is not None and self._doc._c_doc is c_doc:
 611              return self._doc
 612          else:
 613              return _documentFactory(c_doc, parser)
 614  
 615      cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
 616                                         xmlDoc* result, filename) except NULL:
 617          recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
 618          return _handleParseResult(self, self._c_ctxt, result,
 619                                    filename, recover,
 620                                    free_doc=self._doc is None)
 621  
 622  cdef _initParserContext(_ParserContext context,
 623                          _ResolverRegistry resolvers,
 624                          xmlparser.xmlParserCtxt* c_ctxt):
 625      _initResolverContext(context, resolvers)
 626      if c_ctxt is not NULL:
 627          context._initParserContext(c_ctxt)
 628  
 629  cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
 630      (<_ParserContext>_parser_context._private)._error_log._receive(error)
 631  
 632  cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
 633      if __DEBUG:
 634          if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
 635              _forwardError(NULL, error)
 636          else:
 637              _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
 638  
 639  cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
 640                            _ErrorLog error_log) except -1:
 641      if filename is not None and \
 642             ctxt.lastError.domain == xmlerror.XML_FROM_IO:
 643          if isinstance(filename, bytes):
 644              filename = _decodeFilenameWithLength(
 645                  <bytes>filename, len(<bytes>filename))
 646          if ctxt.lastError.message is not NULL:
 647              try:
 648                  message = ctxt.lastError.message.decode('utf-8')
 649              except UnicodeDecodeError:
 650                  # the filename may be in there => play it safe
 651                  message = ctxt.lastError.message.decode('iso8859-1')
 652              message = f"Error reading file '{filename}': {message.strip()}"
 653          else:
 654              message = f"Error reading '{filename}'"
 655          raise IOError, message
 656      elif error_log:
 657          raise error_log._buildParseException(
 658              XMLSyntaxError, u"Document is not well formed")
 659      elif ctxt.lastError.message is not NULL:
 660          message = ctxt.lastError.message.strip()
 661          code = ctxt.lastError.code
 662          line = ctxt.lastError.line
 663          column = ctxt.lastError.int2
 664          if ctxt.lastError.line > 0:
 665              message = f"line {line}: {message}"
 666          raise XMLSyntaxError(message, code, line, column, filename)
 667      else:
 668          raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
 669                               filename)
 670  
 671  cdef xmlDoc* _handleParseResult(_ParserContext context,
 672                                  xmlparser.xmlParserCtxt* c_ctxt,
 673                                  xmlDoc* result, filename,
 674                                  bint recover, bint free_doc) except NULL:
          # Post-process the outcome of a libxml2 parser run.
          #
          # Decides whether 'result' is acceptable, drops it otherwise (freeing
          # it only if 'free_doc' is set), re-raises a Python exception stored
          # in the context, and raises a parse error when no document is left.
          # On success, missing URL/encoding fields of the document are filled
          # in and the document is returned.
          #
          # 'context' may be None: several branches below test for that, so
          # every access to it is guarded consistently.
 675      cdef bint well_formed
 676      if result is not NULL:
 677          __GLOBAL_PARSER_CONTEXT.initDocDict(result)
 678  
          # detach the document from the parser context; a document that is
          # not the one we were given is freed here
 679      if c_ctxt.myDoc is not NULL:
 680          if c_ctxt.myDoc is not result:
 681              __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
 682              tree.xmlFreeDoc(c_ctxt.myDoc)
 683          c_ctxt.myDoc = NULL
 684  
 685      if result is not NULL:
 686          if (context is not None and context._validator is not None and
 687                  not context._validator.isvalid()):
 688              well_formed = 0  # actually not 'valid', but anyway ...
 689          elif (not c_ctxt.wellFormed and not c_ctxt.html and
 690                  c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                      context is not None and
 691                  [1 for error in context._error_log
 692                   if error.type == ErrorTypes.ERR_INVALID_CHAR]):
 693              # An encoding error occurred and libxml2 switched from UTF-8
 694              # input to (undecoded) Latin-1, at some arbitrary point in the
 695              # document.  Better raise an error than allowing for a broken
 696              # tree with mixed encodings.
 697              well_formed = 0
 698          elif recover or (c_ctxt.wellFormed and
 699                           c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
 700              well_formed = 1
 701          elif not c_ctxt.replaceEntities and not c_ctxt.validate \
 702                   and context is not None:
 703              # in this mode, we ignore errors about undefined entities
 704              for error in context._error_log.filter_from_errors():
 705                  if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
 706                         error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
 707                      well_formed = 0
 708                      break
 709              else:
 710                  well_formed = 1
 711          else:
 712              well_formed = 0
 713  
 714          if not well_formed:
 715              if free_doc:
 716                  tree.xmlFreeDoc(result)
 717              result = NULL
 718  
          # a Python exception stored in the context takes precedence over
          # any parse result
 719      if context is not None and context._has_raised():
 720          if result is not NULL:
 721              if free_doc:
 722                  tree.xmlFreeDoc(result)
 723              result = NULL
 724          context._raise_if_stored()
 725  
 726      if result is NULL:
 727          if context is not None:
 728              _raiseParseError(c_ctxt, filename, context._error_log)
 729          else:
 730              _raiseParseError(c_ctxt, filename, None)
 731      else:
 732          if result.URL is NULL and filename is not None:
 733              result.URL = tree.xmlStrdup(_xcstr(filename))
 734          if result.encoding is NULL:
 735              result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
 736  
 737      if context is not None and context._validator is not None and \
 738             context._validator._add_default_attributes:
 739          # we currently need to do this here as libxml2 does not
 740          # support inserting default attributes during parse-time
 741          # validation
 742          context._validator.inject_default_attributes(result)
 743  
 744      return result
 745  
 746  cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
          # Run _fixHtmlDictNodeNames() on the element nodes of c_doc,
          # starting the traversal at the first child of the document.
          #
          # Returns 0 on success (also for a NULL document) and -1 as soon as
          # the per-node fix-up fails.
          # NOTE(review): the traversal order/extent is defined by the
          # BEGIN/END_FOR_EACH_ELEMENT_FROM macros, which are declared elsewhere.
 747      cdef xmlNode* c_node
 748      if c_doc is NULL:
 749          return 0
 750      c_node = c_doc.children
 751      tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
 752      if c_node.type == tree.XML_ELEMENT_NODE:
 753          if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
 754              return -1
 755      tree.END_FOR_EACH_ELEMENT_FROM(c_node)
 756      return 0
 757  
 758  cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
 759                                    xmlNode* c_start_node) nogil:
 760      """
 761      Move names to the dict, iterating in document order, starting at
 762      c_start_node. This is used in incremental parsing after each chunk.
 763      """
          # Returns 0 on success (also when there is no document) and -1 if
          # the fix-up of any element node fails.
 764      cdef xmlNode* c_node
 765      if not c_doc:
 766          return 0
 767      if not c_start_node:
              # no known start position => process the whole document
 768          return _fixHtmlDictNames(c_dict, c_doc)
 769      c_node = c_start_node
 770      tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
 771      if c_node.type == tree.XML_ELEMENT_NODE:
 772          if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
 773              return -1
 774      tree.END_FOR_EACH_ELEMENT_FROM(c_node)
 775      return 0
 776  
 777  cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
 778                                        xmlNode* c_node) nogil:
          # Replace the name of c_node and the names of all nodes in its
          # 'properties' chain by the pointers that xmlDictLookup() returns
          # for them in c_dict.
          #
          # Returns 0 on success and -1 if a dict lookup returns NULL.
 779      cdef xmlNode* c_attr
 780      c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
 781      if c_name is NULL:
 782          return -1
 783      if c_name is not c_node.name:
              # the name was a separate allocation: free it and switch over
              # to the dict entry
 784          tree.xmlFree(<char*>c_node.name)
 785          c_node.name = c_name
 786      c_attr = <xmlNode*>c_node.properties
 787      while c_attr is not NULL:
 788          c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
 789          if c_name is NULL:
 790              return -1
 791          if c_name is not c_attr.name:
 792              tree.xmlFree(<char*>c_attr.name)
 793              c_attr.name = c_name
 794          c_attr = c_attr.next
 795      return 0
 796  
 797  @cython.internal
 798  cdef class _BaseParser:
          # Shared implementation of the parser classes.  __init__() only
          # accepts instances of XMLParser or HTMLParser.
          #
          # The class keeps two lazily created parser contexts: one for the
          # one-shot _parse*() methods and a separate one for push parsing.
 799      cdef ElementClassLookup _class_lookup
 800      cdef _ResolverRegistry _resolvers
 801      cdef _ParserContext _parser_context
 802      cdef _ParserContext _push_parser_context
 803      cdef int _parse_options
 804      cdef bint _for_html
 805      cdef bint _remove_comments
 806      cdef bint _remove_pis
 807      cdef bint _strip_cdata
 808      cdef bint _collect_ids
 809      cdef XMLSchema _schema
 810      cdef bytes _filename
 811      cdef readonly object target
 812      cdef object _default_encoding
 813      cdef tuple _events_to_collect  # (event_types, tag)
 814  
 815      def __init__(self, int parse_options, bint for_html, XMLSchema schema,
 816                   remove_comments, remove_pis, strip_cdata, collect_ids,
 817                   target, encoding):
              # Store the parser configuration.
              #
              # Raises TypeError when instantiated for anything but an
              # XMLParser/HTMLParser and LookupError when libxml2 has no
              # encoding handler for 'encoding'.
 818          cdef tree.xmlCharEncodingHandler* enchandler
 819          cdef int c_encoding
 820          if not isinstance(self, (XMLParser, HTMLParser)):
 821              raise TypeError, u"This class cannot be instantiated"
 822  
 823          self._parse_options = parse_options
 824          self.target = target
 825          self._for_html = for_html
 826          self._remove_comments = remove_comments
 827          self._remove_pis = remove_pis
 828          self._strip_cdata = strip_cdata
 829          self._collect_ids = collect_ids
 830          self._schema = schema
 831  
 832          self._resolvers = _ResolverRegistry()
 833  
 834          if encoding is None:
 835              self._default_encoding = None
 836          else:
 837              encoding = _utf8(encoding)
                  # the handler is only looked up to validate the encoding name
                  # and is closed again right away
 838              enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
 839              if enchandler is NULL:
 840                  raise LookupError, f"unknown encoding: '{encoding}'"
 841              tree.xmlCharEncCloseFunc(enchandler)
 842              self._default_encoding = encoding
 843  
 844      cdef _setBaseURL(self, base_url):
              # Remember the encoded base URL as the parser's filename.
 845          self._filename = _encodeFilename(base_url)
 846  
 847      cdef _collectEvents(self, event_types, tag):
              # Store the (deduplicated) event types and the tag for which the
              # push parser context should collect events.
 848          if event_types is None:
 849              event_types = ()
 850          else:
 851              event_types = tuple(set(event_types))
 852              _buildParseEventFilter(event_types)  # purely for validation
 853          self._events_to_collect = (event_types, tag)
 854  
 855      cdef _ParserContext _getParserContext(self):
              # Return the context used by the _parse*() methods, creating and
              # configuring it on first use.
 856          cdef xmlparser.xmlParserCtxt* pctxt
 857          if self._parser_context is None:
 858              self._parser_context = self._createContext(self.target, None)
 859              self._parser_context._collect_ids = self._collect_ids
 860              if self._schema is not None:
 861                  self._parser_context._validator = \
 862                      self._schema._newSaxValidator(
 863                          self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 864              pctxt = self._newParserCtxt()
 865              _initParserContext(self._parser_context, self._resolvers, pctxt)
 866              self._configureSaxContext(pctxt)
 867          return self._parser_context
 868  
 869      cdef _ParserContext _getPushParserContext(self):
              # Return the context used for push parsing, creating and
              # configuring it on first use.  Unlike _getParserContext(), it
              # is set up with the events requested via _collectEvents().
 870          cdef xmlparser.xmlParserCtxt* pctxt
 871          if self._push_parser_context is None:
 872              self._push_parser_context = self._createContext(
 873                  self.target, self._events_to_collect)
 874              self._push_parser_context._collect_ids = self._collect_ids
 875              if self._schema is not None:
 876                  self._push_parser_context._validator = \
 877                      self._schema._newSaxValidator(
 878                          self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 879              pctxt = self._newPushParserCtxt()
 880              _initParserContext(
 881                  self._push_parser_context, self._resolvers, pctxt)
 882              self._configureSaxContext(pctxt)
 883          return self._push_parser_context
 884  
 885      cdef _ParserContext _createContext(self, target, events_to_collect):
              # Pick the context class: a target context when a parser target
              # is set, a SAX context when only events are collected, and a
              # plain _ParserContext otherwise.
 886          cdef _SaxParserContext sax_context
 887          if target is not None:
 888              sax_context = _TargetParserContext(self)
 889              (<_TargetParserContext>sax_context)._setTarget(target)
 890          elif events_to_collect:
 891              sax_context = _SaxParserContext(self)
 892          else:
 893              # nothing special to configure
 894              return _ParserContext()
 895          if events_to_collect:
 896              events, tag = events_to_collect
 897              sax_context._setEventFilter(events, tag)
 898          return sax_context
 899  
 900      @cython.final
 901      cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
              # Switch off the SAX callbacks for node types that this parser
              # is configured to drop.
 902          if self._remove_comments:
 903              pctxt.sax.comment = NULL
 904          if self._remove_pis:
 905              pctxt.sax.processingInstruction = NULL
 906          if self._strip_cdata:
 907              # hard switch-off for CDATA nodes => makes them plain text
 908              pctxt.sax.cdataBlock = NULL
 909  
 910      cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
              # Install _receiveParserError as structured error callback on an
              # initialised SAX handler that is not yet marked as SAX2.
              # Raises MemoryError if the handler copy cannot be allocated.
 911          cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
 912          if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
 913              # need to extend SAX1 context to SAX2 to get proper error reports
 914              if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
                      # never modify the shared default handler: work on a
                      # private copy instead
 915                  sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
 916                  if sax is NULL:
 917                      raise MemoryError()
 918                  cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
 919                                   sizeof(htmlparser.htmlDefaultSAXHandler))
 920                  c_ctxt.sax = sax
 921              sax.initialized = xmlparser.XML_SAX2_MAGIC
 922              sax.serror = _receiveParserError
 923              sax.startElementNs = NULL
 924              sax.endElementNs = NULL
 925              sax._private = NULL
 926          return 0
 927  
 928      cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
              # Create the libxml2 parser context for the _parse*() methods and
              # hook _initSaxDocument in as startDocument callback.
              # Raises MemoryError if libxml2 returns no context.
 929          cdef xmlparser.xmlParserCtxt* c_ctxt
 930          if self._for_html:
 931              c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
 932              if c_ctxt is not NULL:
 933                  self._registerHtmlErrorHandler(c_ctxt)
 934          else:
 935              c_ctxt = xmlparser.xmlNewParserCtxt()
 936          if c_ctxt is NULL:
 937              raise MemoryError
 938          c_ctxt.sax.startDocument = _initSaxDocument
 939          return c_ctxt
 940  
 941      cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
              # Create the libxml2 push parser context, apply the parse options
              # and hook _initSaxDocument in as startDocument callback.
              # Raises MemoryError if libxml2 returns no context.
 942          cdef xmlparser.xmlParserCtxt* c_ctxt
 943          cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
 944          if self._for_html:
 945              c_ctxt = htmlparser.htmlCreatePushParserCtxt(
 946                  NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
 947              if c_ctxt is not NULL:
 948                  self._registerHtmlErrorHandler(c_ctxt)
 949                  htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
 950          else:
 951              c_ctxt = xmlparser.xmlCreatePushParserCtxt(
 952                  NULL, NULL, NULL, 0, c_filename)
 953              if c_ctxt is not NULL:
 954                  xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
 955          if c_ctxt is NULL:
 956              raise MemoryError()
 957          c_ctxt.sax.startDocument = _initSaxDocument
 958          return c_ctxt
 959  
 960      @property
 961      def error_log(self):
 962          """The error log of the last parser run.
 963          """
 964          cdef _ParserContext context
 965          context = self._getParserContext()
 966          return context._error_log.copy()
 967  
 968      @property
 969      def resolvers(self):
 970          """The custom resolver registry of this parser."""
 971          return self._resolvers
 972  
 973      @property
 974      def version(self):
 975          """The version of the underlying XML parser."""
 976          return u"libxml2 %d.%d.%d" % LIBXML_VERSION
 977  
 978      def setElementClassLookup(self, ElementClassLookup lookup = None):
 979          u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
 980          self.set_element_class_lookup(lookup)
 981  
 982      def set_element_class_lookup(self, ElementClassLookup lookup = None):
 983          u"""set_element_class_lookup(self, lookup = None)
 984  
 985          Set a lookup scheme for element classes generated from this parser.
 986  
 987          Reset it by passing None or nothing.
 988          """
 989          self._class_lookup = lookup
 990  
 991      cdef _BaseParser _copy(self):
 992          u"Create a new parser with the same configuration."
              # The new instance is created without arguments and thus starts
              # out with the constructor defaults, so every setting has to be
              # copied over explicitly.  The resolver registry is shared with
              # the original parser, not duplicated.
 993          cdef _BaseParser parser
 994          parser = self.__class__()
 995          parser._parse_options = self._parse_options
 996          parser._for_html = self._for_html
 997          parser._remove_comments = self._remove_comments
 998          parser._remove_pis = self._remove_pis
 999          parser._strip_cdata = self._strip_cdata
              # without this, the copy silently falls back to the default
              # ID collection behaviour of the constructor
              parser._collect_ids = self._collect_ids
1000          parser._filename = self._filename
1001          parser._resolvers = self._resolvers
1002          parser.target = self.target
1003          parser._class_lookup  = self._class_lookup
1004          parser._default_encoding = self._default_encoding
1005          parser._schema = self._schema
1006          parser._events_to_collect = self._events_to_collect
1007          return parser
1008  
1009      def copy(self):
1010          u"""copy(self)
1011  
1012          Create a new parser with the same configuration.
1013          """
1014          return self._copy()
1015  
1016      def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
1017          u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
1018  
1019          Creates a new element associated with this parser.
1020          """
1021          return _makeElement(_tag, NULL, None, self, None, None,
1022                              attrib, nsmap, _extra)
1023  
1024      # internal parser methods
1025  
1026      cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
1027          u"""Parse unicode document, share dictionary if possible.
1028          """
              # The internal buffer of the unicode object is handed to libxml2
              # directly; the encoding name passed along is chosen to match
              # the buffer's per-character width and the platform byte order.
1029          cdef _ParserContext context
1030          cdef xmlDoc* result
1031          cdef xmlparser.xmlParserCtxt* pctxt
1032          cdef Py_ssize_t py_buffer_len
1033          cdef int buffer_len, c_kind
1034          cdef const_char* c_text
1035          cdef const_char* c_encoding = _PY_UNICODE_ENCODING
1036          cdef bint is_pep393_string = (
1037              python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext))
1038          if is_pep393_string:
1039              c_text = <const_char*>python.PyUnicode_DATA(utext)
1040              py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
1041              c_kind = python.PyUnicode_KIND(utext)
                  # the length is in characters: scale it to a byte count
1042              if c_kind == 1:
1043                  c_encoding = 'ISO-8859-1'
1044              elif c_kind == 2:
1045                  py_buffer_len *= 2
1046                  if python.PY_BIG_ENDIAN:
1047                      c_encoding = 'UTF-16BE'  # actually UCS-2
1048                  else:
1049                      c_encoding = 'UTF-16LE'  # actually UCS-2
1050              elif c_kind == 4:
1051                  py_buffer_len *= 4
1052                  if python.PY_BIG_ENDIAN:
1053                      c_encoding = 'UCS-4BE'
1054                  else:
1055                      c_encoding = 'UCS-4LE'
1056              else:
1057                  assert False, f"Illegal Unicode kind {c_kind}"
1058          else:
1059              py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
1060              c_text = python.PyUnicode_AS_DATA(utext)
              # libxml2 takes the buffer size as a C int
1061          assert 0 <= py_buffer_len <= limits.INT_MAX
1062          buffer_len = py_buffer_len
1063  
1064          context = self._getParserContext()
1065          context.prepare()
1066          try:
1067              pctxt = context._c_ctxt
1068              __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1069              orig_options = pctxt.options
1070              with nogil:
1071                  if self._for_html:
1072                      result = htmlparser.htmlCtxtReadMemory(
1073                          pctxt, c_text, buffer_len, c_filename, c_encoding,
1074                          self._parse_options)
1075                      if result is not NULL:
1076                          if _fixHtmlDictNames(pctxt.dict, result) < 0:
1077                              tree.xmlFreeDoc(result)
1078                              result = NULL
1079                  else:
1080                      result = xmlparser.xmlCtxtReadMemory(
1081                          pctxt, c_text, buffer_len, c_filename, c_encoding,
1082                          self._parse_options)
1083              pctxt.options = orig_options # work around libxml2 problem
1084  
1085              return context._handleParseResultDoc(self, result, None)
1086          finally:
1087              context.cleanup()
1088  
1089      cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
1090                             char* c_filename) except NULL:
1091          u"""Parse document, share dictionary if possible.
1092          """
              # Parses 'c_len' bytes at 'c_text'.  The parser's default
              # encoding is used if set; otherwise only UTF-32 input is given
              # an explicit encoding and everything else is left to libxml2.
1093          cdef _ParserContext context
1094          cdef xmlDoc* result
1095          cdef xmlparser.xmlParserCtxt* pctxt
1096          cdef char* c_encoding
1097          cdef tree.xmlCharEncoding enc
1098          context = self._getParserContext()
1099          context.prepare()
1100          try:
1101              pctxt = context._c_ctxt
1102              __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1103  
1104              if self._default_encoding is None:
1105                  c_encoding = NULL
1106                  # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
1107                  # NOTE: limit to problematic cases because it changes character offsets
1108                  if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and
1109                                     c_text[2] == 0 and c_text[3] == 0):
1110                      c_encoding = "UTF-32LE"
1111                      c_text += 4
1112                      c_len -= 4
1113                  elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
1114                                       c_text[2] == '\xFE' and c_text[3] == '\xFF'):
1115                      c_encoding = "UTF-32BE"
1116                      c_text += 4
1117                      c_len -= 4
1118                  else:
1119                      # no BOM => try to determine encoding
1120                      enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
1121                      if enc == tree.XML_CHAR_ENCODING_UCS4LE:
1122                          c_encoding = 'UTF-32LE'
1123                      elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
1124                          c_encoding = 'UTF-32BE'
1125              else:
1126                  c_encoding = _cstr(self._default_encoding)
1127  
1128              orig_options = pctxt.options
1129              with nogil:
1130                  if self._for_html:
1131                      result = htmlparser.htmlCtxtReadMemory(
1132                          pctxt, c_text, c_len, c_filename,
1133                          c_encoding, self._parse_options)
1134                      if result is not NULL:
1135                          if _fixHtmlDictNames(pctxt.dict, result) < 0:
1136                              tree.xmlFreeDoc(result)
1137                              result = NULL
1138                  else:
1139                      result = xmlparser.xmlCtxtReadMemory(
1140                          pctxt, c_text, c_len, c_filename,
1141                          c_encoding, self._parse_options)
1142              pctxt.options = orig_options # work around libxml2 problem
1143  
1144              return context._handleParseResultDoc(self, result, None)
1145          finally:
1146              context.cleanup()
1147  
1148      cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
              # Let libxml2 read and parse the file 'c_filename', using the
              # parser's default encoding if one is set.
1149          cdef _ParserContext context
1150          cdef xmlDoc* result
1151          cdef xmlparser.xmlParserCtxt* pctxt
1152          cdef char* c_encoding
1153          result = NULL
1154  
1155          context = self._getParserContext()
1156          context.prepare()
1157          try:
1158              pctxt = context._c_ctxt
1159              __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1160  
1161              if self._default_encoding is None:
1162                  c_encoding = NULL
1163              else:
1164                  c_encoding = _cstr(self._default_encoding)
1165  
1166              orig_options = pctxt.options
1167              with nogil:
1168                  if self._for_html:
1169                      result = htmlparser.htmlCtxtReadFile(
1170                          pctxt, c_filename, c_encoding, self._parse_options)
1171                      if result is not NULL:
1172                          if _fixHtmlDictNames(pctxt.dict, result) < 0:
1173                              tree.xmlFreeDoc(result)
1174                              result = NULL
1175                  else:
1176                      result = xmlparser.xmlCtxtReadFile(
1177                          pctxt, c_filename, c_encoding, self._parse_options)
1178              pctxt.options = orig_options # work around libxml2 problem
1179  
1180              return context._handleParseResultDoc(self, result, c_filename)
1181          finally:
1182              context.cleanup()
1183  
1184      cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
1185                                         encoding) except NULL:
              # Parse from a file-like object through a _FileReaderContext.
              # An explicit 'encoding' takes precedence over the parser's
              # default encoding; an empty 'filename' is treated as None.
1186          cdef _ParserContext context
1187          cdef _FileReaderContext file_context
1188          cdef xmlDoc* result
1189          cdef xmlparser.xmlParserCtxt* pctxt
1190          cdef char* c_filename
1191          if not filename:
1192              filename = None
1193  
1194          context = self._getParserContext()
1195          context.prepare()
1196          try:
1197              pctxt = context._c_ctxt
1198              __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1199              file_context = _FileReaderContext(
1200                  filelike, context, filename,
1201                  encoding or self._default_encoding)
1202              result = file_context._readDoc(pctxt, self._parse_options)
1203  
1204              return context._handleParseResultDoc(
1205                  self, result, filename)
1206          finally:
1207              context.cleanup()
1208  
1209  
1210  cdef void _initSaxDocument(void* ctxt) with gil:
          # SAX startDocument callback that the parser classes in this file
          # install on their libxml2 contexts.  It runs the default SAX2
          # handler first and then sets up the name dictionary and the XML ID
          # table of the newly created document.
1211      xmlparser.xmlSAX2StartDocument(ctxt)
1212      c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
1213      c_doc = c_ctxt.myDoc
1214  
1215      # set up document dict
1216      if c_doc and c_ctxt.dict and not c_doc.dict:
1217          # I have no idea why libxml2 disables this - we need it
1218          c_ctxt.dictNames = 1
1219          c_doc.dict = c_ctxt.dict
              # the document now refers to the parser dict as well
1220          xmlparser.xmlDictReference(c_ctxt.dict)
1221  
1222      # set up XML ID hash table
1223      if c_ctxt._private:
              # _private is expected to hold the _ParserContext object
1224          context = <_ParserContext>c_ctxt._private
1225          if context._collect_ids:
1226              # keep the global parser dict from filling up with XML IDs
1227              if c_doc and not c_doc.ids:
1228                  # memory errors are not fatal here
1229                  c_dict = xmlparser.xmlDictCreate()
1230                  if c_dict:
1231                      c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
                          # NOTE(review): presumably the hash table keeps its
                          # own reference to the dict - confirm
1232                      xmlparser.xmlDictFree(c_dict)
1233                  else:
1234                      c_doc.ids = tree.xmlHashCreate(0)
1235          else:
                  # ID collection is disabled: tell libxml2 to skip IDs
1236              c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
1237              if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
1238                  # already initialised but empty => clear
1239                  tree.xmlHashFree(c_doc.ids, NULL)
1240                  c_doc.ids = NULL
1241  
1242  
1243  ############################################################
1244  ## ET feed parser
1245  ############################################################
1246  
1247  cdef class _FeedParser(_BaseParser):
          # Adds the incremental feed()/close() interface on top of
          # _BaseParser, using the separate push parser context.
          #
          # _feed_parser_running is set by the first feed() call of a run and
          # reset by close() or when feed() hands over a failed parse result.
1248      cdef bint _feed_parser_running
1249  
1250      @property
1251      def feed_error_log(self):
1252          """The error log of the last (or current) run of the feed parser.
1253  
1254          Note that this is local to the feed parser and thus is
1255          different from what the ``error_log`` property returns.
1256          """
1257          return self._getPushParserContext()._error_log.copy()
1258  
1259      cpdef feed(self, data):
1260          u"""feed(self, data)
1261  
1262          Feeds data to the parser.  The argument should be an 8-bit string
1263          buffer containing encoded data, although Unicode is supported as long
1264          as both string types are not mixed.
1265  
1266          This is the main entry point to the consumer interface of a
1267          parser.  The parser will parse as much of the XML stream as it
1268          can on each call.  To finish parsing or to reset the parser,
1269          call the ``close()`` method.  Both methods may raise
1270          ParseError if errors occur in the input data.  If an error is
1271          raised, there is no longer a need to call ``close()``.
1272  
1273          The feed parser interface is independent of the normal parser
1274          usage.  You can use the same parser as a feed parser and in
1275          the ``parse()`` function concurrently.
1276          """
1277          cdef _ParserContext context
1278          cdef bytes bstring
1279          cdef xmlparser.xmlParserCtxt* pctxt
1280          cdef Py_ssize_t py_buffer_len, ustart
1281          cdef const_char* char_data
1282          cdef const_char* c_encoding
1283          cdef int buffer_len
1284          cdef int error
1285          cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
1286  
              # char_data stays NULL for unicode input, which is encoded to
              # UTF-8 chunk by chunk further down; py_buffer_len then counts
              # characters instead of bytes
1287          if isinstance(data, bytes):
1288              if self._default_encoding is None:
1289                  c_encoding = NULL
1290              else:
1291                  c_encoding = self._default_encoding
1292              char_data = _cstr(data)
1293              py_buffer_len = python.PyBytes_GET_SIZE(data)
1294              ustart = 0
1295          elif isinstance(data, unicode):
1296              c_encoding = b"UTF-8"
1297              char_data = NULL
1298              py_buffer_len = len(<unicode> data)
1299              ustart = 0
1300          else:
1301              raise TypeError, u"Parsing requires string data"
1302  
1303          context = self._getPushParserContext()
1304          pctxt = context._c_ctxt
1305          error = 0
1306          if not self._feed_parser_running:
                  # first chunk of a new run: reset the push parser
1307              context.prepare(set_document_loader=False)
1308              self._feed_parser_running = 1
1309              c_filename = (_cstr(self._filename)
1310                            if self._filename is not None else NULL)
1311  
1312              # We have to give *mlCtxtResetPush() enough input to figure
1313              # out the character encoding (at least four bytes),
1314              # however if we give it all we got, we'll have nothing for
1315              # *mlParseChunk() and things go wrong.
1316              buffer_len = 0
1317              if char_data is not NULL:
1318                  buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
1319              orig_loader = _register_document_loader()
1320              if self._for_html:
1321                  error = _htmlCtxtResetPush(
1322                      pctxt, char_data, buffer_len, c_filename, c_encoding,
1323                      self._parse_options)
1324              else:
1325                  xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
1326                  error = xmlparser.xmlCtxtResetPush(
1327                      pctxt, char_data, buffer_len, c_filename, c_encoding)
1328              _reset_document_loader(orig_loader)
                  # skip the bytes that were already passed to the reset call
1329              py_buffer_len -= buffer_len
1330              char_data += buffer_len
1331              if error:
1332                  raise MemoryError()
1333              __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1334  
1335          #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding
1336  
1337          fixup_error = 0
              # in recovery mode, parse errors do not stop the feeding
1338          while py_buffer_len > 0 and (error == 0 or recover):
1339              if char_data is NULL:
1340                  # Unicode parsing by converting chunks to UTF-8
1341                  buffer_len = 2**19  # len(bytes) <= 4 * (2**19) == 2 MiB
1342                  bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8')
1343                  ustart += buffer_len
1344                  py_buffer_len -= buffer_len  # may end up < 0
1345                  error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring))
1346              else:
1347                  # Direct byte string parsing.
1348                  buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX
1349                  error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len)
1350                  py_buffer_len -= buffer_len
1351                  char_data += buffer_len
1352  
1353              if fixup_error:
1354                  context.store_exception(MemoryError())
1355  
1356              if context._has_raised():
1357                  # propagate Python exceptions immediately
1358                  recover = 0
1359                  error = 1
1360                  break
1361  
1362              if error and not pctxt.replaceEntities and not pctxt.validate:
1363                  # in this mode, we ignore errors about undefined entities
1364                  for entry in context._error_log.filter_from_errors():
1365                      if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
1366                             entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
1367                          break
1368                  else:
1369                      error = 0
1370  
1371          if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
1372              # propagate Python exceptions immediately
1373              recover = 0
1374              error = 1
1375  
              # on failure, end the run and hand the (partial) document to the
              # context's result handling
              # NOTE(review): that call is presumably what raises the error
              # described in the docstring - confirm
1376          if fixup_error or not recover and (error or not pctxt.wellFormed):
1377              self._feed_parser_running = 0
1378              try:
1379                  context._handleParseResult(self, pctxt.myDoc, None)
1380              finally:
1381                  context.cleanup()
1382  
1383      cpdef close(self):
1384          u"""close(self)
1385  
1386          Terminates feeding data to this parser.  This tells the parser to
1387          process any remaining data in the feed buffer, and then returns the
1388          root Element of the tree that was parsed.
1389  
1390          This method must be called after passing the last chunk of data into
1391          the ``feed()`` method.  It should only be called when using the feed
1392          parser interface, all other usage is undefined.
1393          """
              # raises XMLSyntaxError if no feed parser run is active
1394          if not self._feed_parser_running:
1395              raise XMLSyntaxError(u"no element found",
1396                                   xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
1397                                   self._filename)
1398  
1399          context = self._getPushParserContext()
1400          pctxt = context._c_ctxt
1401  
1402          self._feed_parser_running = 0
              # an empty chunk with the 'terminate' flag set ends the parse
1403          if self._for_html:
1404              htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
1405          else:
1406              xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
1407  
1408          if (pctxt.recovery and not pctxt.disableSAX and
1409                  isinstance(context, _SaxParserContext)):
1410              # apply any left-over 'end' events
1411              (<_SaxParserContext>context).flushEvents()
1412  
1413          try:
1414              result = context._handleParseResult(self, pctxt.myDoc, None)
1415          finally:
1416              context.cleanup()
1417  
              # a result that is not a _Document is returned unchanged
1418          if isinstance(result, _Document):
1419              return (<_Document>result).getroot()
1420          else:
1421              return result
1422  
1423  
cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt,
                                  const char* char_data, int buffer_len):
    # Feed one chunk of 'buffer_len' bytes from 'char_data' into the push
    # parser context 'c_ctxt', with the GIL released.
    #
    # Returns the pair (error, fixup_error):
    # - error: return value of htmlParseChunk() / xmlParseChunk()
    # - fixup_error: return value of _fixHtmlDictSubtreeNames(), or 0 if
    #   that fix-up did not run (XML parsing, or no document created yet)
    fixup_error = 0
    with nogil:
        if c_ctxt.html:
            c_node = c_ctxt.node  # last node where the parser stopped
            # the document loader is swapped in only for the parser call
            orig_loader = _register_document_loader()
            error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
            # and now for the fun part: move node names to the dict
            if c_ctxt.myDoc:
                fixup_error = _fixHtmlDictSubtreeNames(
                    c_ctxt.dict, c_ctxt.myDoc, c_node)
                # make the document share the parser context's dict:
                # release the document's own dict and take a new reference
                # to the context dict instead
                if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict:
                    xmlparser.xmlDictFree(c_ctxt.myDoc.dict)
                    c_ctxt.myDoc.dict = c_ctxt.dict
                    xmlparser.xmlDictReference(c_ctxt.dict)
        else:
            # XML: same loader handling, no name fix-up afterwards
            orig_loader = _register_document_loader()
            error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
    return (error, fixup_error)
1446  
1447  
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    # Reset 'c_ctxt' for a new push parser run and configure it for HTML.
    #
    # Returns 0 on success, otherwise the non-zero error code reported by
    # xmlCtxtResetPush().
    # NOTE(review): with 'except -1', an error code of exactly -1 would be
    # read as a raised Python exception - confirm libxml2 never returns it.

    # libxml2 lacks an HTML push parser setup function
    error = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if error:
        return error

    # fix libxml2 setup for HTML
    c_ctxt.progressive = 1
    c_ctxt.html = 1
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)

    return 0
1465  
1466  
1467  ############################################################
1468  ## XML parser
1469  ############################################################
1470  
# libxml2 option flags that XMLParser.__init__() starts from before it
# applies the keyword arguments.
cdef int _XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT
    | xmlparser.XML_PARSE_NOCDATA
    | xmlparser.XML_PARSE_NONET
    | xmlparser.XML_PARSE_COMPACT
    | xmlparser.XML_PARSE_BIG_LINES
)
1479  
cdef class XMLParser(_FeedParser):
    u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)

    The XML parser.

    A parser can be passed as an additional argument to the various parse
    functions of the lxml API.  There is always a default parser, which
    can be replaced by calling the global function 'set_default_parser'.
    Creating new parsers at any time does not incur a major run-time
    overhead.

    Most constructor keyword arguments map to the libxml2 parser
    configuration.  Requesting DTD validation or attribute default values
    also loads a DTD (unless an XMLSchema is provided as well, from which
    the default attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
    - resolve_entities   - replace entities by their text value (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        cdef int parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # features that are off by default: switch their flags on
        if load_dtd:
            parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            # validation also loads the DTD
            parse_options |= (xmlparser.XML_PARSE_DTDVALID |
                              xmlparser.XML_PARSE_DTDLOAD)
        if attribute_defaults:
            parse_options |= xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # no schema to take the defaults from => load the DTD
                parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options |= xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options |= xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options |= xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # features that are part of the defaults: switch their flags off
        if not no_network:
            parse_options &= ~xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options &= ~xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options &= ~xmlparser.XML_PARSE_NOENT
        if not strip_cdata:
            parse_options &= ~xmlparser.XML_PARSE_NOCDATA

        # second argument 0: this is not an HTML parser
        _BaseParser.__init__(self, parse_options, 0, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
1562  
1563  
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    XML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.

    All other keyword arguments are passed on to ``XMLParser``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        if events is None:
            # default: only report Element end events
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        """read_events(self)

        Return the events iterator of the push parser context.
        """
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
1591  
1592  
cdef class ETCompatXMLParser(XMLParser):
    u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 collect_ids=True, target=None, compact=True)

    An XML parser with an ElementTree compatible default setup.

    See the XMLParser class for details.

    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
    and thus ignores comments and processing instructions.  All keyword
    arguments are passed on to ``XMLParser`` unchanged.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        # 'collect_ids' is forwarded like every other XMLParser option, so
        # that this subclass supports the same configuration as its base.
        XMLParser.__init__(self,
                           attribute_defaults=attribute_defaults,
                           dtd_validation=dtd_validation,
                           load_dtd=load_dtd,
                           no_network=no_network,
                           ns_clean=ns_clean,
                           recover=recover,
                           remove_blank_text=remove_blank_text,
                           huge_tree=huge_tree,
                           compact=compact,
                           resolve_entities=resolve_entities,
                           remove_comments=remove_comments,
                           remove_pis=remove_pis,
                           strip_cdata=strip_cdata,
                           collect_ids=collect_ids,
                           target=target,
                           encoding=encoding,
                           schema=schema)

# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser
1634  
1635  
# Module-level XMLParser with default settings.  It is registered with the
# global parser context right away and is what set_default_parser() falls
# back to when called without a parser.
cdef XMLParser __DEFAULT_XML_PARSER
__DEFAULT_XML_PARSER = XMLParser()

__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1640  
def set_default_parser(_BaseParser parser=None):
    u"""set_default_parser(parser=None)

    Install ``parser`` as the default parser of the current thread.  The
    default parser is used globally by the parse functions of the lxml API
    whenever they are called without an explicit parser.  Calling this
    function without a parser (or with None) restores the original
    default configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    if parser is not None:
        __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
    else:
        # no parser given: go back to the pre-installed one
        __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1656  
def get_default_parser():
    u"""get_default_parser()

    Return the default parser as provided by the global parser context.
    """
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
1660  
1661  ############################################################
1662  ## HTML parser
1663  ############################################################
1664  
# libxml2 option flags that HTMLParser.__init__() starts from before it
# applies the keyword arguments.
cdef int _HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER
    | htmlparser.HTML_PARSE_NONET
    | htmlparser.HTML_PARSE_COMPACT
)
1671  
cdef class HTMLParser(_FeedParser):
    u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS
        if remove_blank_text:
            parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS
        # RECOVER, NONET and COMPACT are part of the defaults, so XOR
        # switches them off here
        if not recover:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
        if not default_doctype:
            # NODEFDTD is not part of the defaults: it must be switched on,
            # so use OR rather than a toggle
            parse_options = parse_options | htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        # second argument 1: this is an HTML parser
        _BaseParser.__init__(self, parse_options, 1, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
1731  
1732  
# Module-level HTMLParser instance created with default settings.
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
1735  
1736  
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that makes its parse events available through an iterator.

    It collects the same events as iterparse().  Unlike iterparse(), the
    parser is non-blocking: data chunks are passed in incrementally via
    the .feed() method rather than being read from a file(-like) object
    by the parser itself.

    Only Element end events are collected by default.  Pass any subset
    of the available events as ``events`` argument to change that:
    ``'start'``, ``'end'``, ``'start-ns'``, ``'end-ns'``, ``'comment'``,
    ``'pi'``.

    Pass the ``base_url`` to support loading external dependencies
    relative to the input source.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # fall back to Element end events if nothing was requested
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
1764  
1765  
1766  ############################################################
1767  ## helper functions for document creation
1768  ############################################################
1769  
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse the in-memory string 'text' (unicode or bytes) into a new xmlDoc.
    #
    # Uses the default parser of the global parser context if 'parser' is
    # None.  'filename' is optional; a false value means no file name is
    # passed on to the parser.
    cdef char* c_filename
    cdef char* c_text
    cdef Py_ssize_t c_len
    cdef bint is_pep393_string
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if not filename:
        c_filename = NULL
    else:
        # 'filename_utf' stays referenced for the rest of the function
        # NOTE(review): presumably c_filename points into it - confirm in _cstr()
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)
    if isinstance(text, unicode):
        is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
        # c_len is the size of the string data in bytes, not in characters
        if is_pep393_string:
            c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
        else:
            c_len = python.PyUnicode_GET_DATA_SIZE(text)
        if c_len > limits.INT_MAX:
            # too large for an 'int' length: parse through a file-like object
            return (<_BaseParser>parser)._parseDocFromFilelike(
                StringIO(text), filename, None)
        if _PY_UNICODE_ENCODING is NULL and not is_pep393_string:
            # no encoding name available for the raw unicode buffer:
            # parse a UTF-8 encoded copy instead
            text = (<unicode>text).encode('utf8')
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, "UTF-8")
        return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
    else:
        # anything that is not unicode is handled as a bytes object
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            # too large for an 'int' length: parse through a file-like object
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, None)
        c_text = _cstr(text)
        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
1804  
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    # Parse the file named by 'filename8', using the default parser of the
    # global parser context if no parser is given.
    if parser is not None:
        return parser._parseDocFromFile(_cstr(filename8))
    parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return parser._parseDocFromFile(_cstr(filename8))
1809  
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    # Parse from the file-like object 'source', using the default parser of
    # the global parser context if no parser is given.
    if parser is not None:
        return parser._parseDocFromFilelike(source, filename, None)
    parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return parser._parseDocFromFilelike(source, filename, None)
1815  
cdef xmlDoc* _newXMLDoc() except NULL:
    # Create an empty XML document that is set up with the dict of the
    # global parser context.  Raises MemoryError if the allocation fails.
    cdef xmlDoc* c_doc = tree.xmlNewDoc(NULL)
    if c_doc is NULL:
        raise MemoryError()
    # new documents without an encoding are declared as UTF-8
    if c_doc.encoding is NULL:
        c_doc.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1825  
cdef xmlDoc* _newHTMLDoc() except NULL:
    # Create an empty HTML document that is set up with the dict of the
    # global parser context.  Raises MemoryError if the allocation fails.
    cdef xmlDoc* c_doc = tree.htmlNewDoc(NULL, NULL)
    if c_doc is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1833  
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    # Copy 'c_doc' and set the copy up with the dict of the global parser
    # context.  Raises MemoryError if the copy could not be created.
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        # only the recursive copy runs with the GIL released
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
1845  
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"Recursively copy the document and make c_new_root the new root node."
    # Raises MemoryError if either the document or the node copy fails.
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0) # non recursive
    if result is NULL:
        # same check as in _copyDoc(): never hand NULL to initDocDict()
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
    if c_node is NULL:
        # nothing else owns the new document yet, so free it here
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
1859  
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"Recursively copy the element into the document. c_doc is not modified."
    # the last argument (1) requests a recursive copy
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1)
    if c_copy is NULL:
        raise MemoryError()
    _copyTail(c_node.next, c_copy)
    return c_copy
1868  
1869  
1870  ############################################################
1871  ## API level helper functions for _Document creation
1872  ############################################################
1873  
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    # Parse 'source' into a _Document.  Sources are tried in this order:
    # string (file name / URL), object with getvalue()/tell() that is
    # positioned at 0, object with a read() method.  Anything else is
    # rejected with a TypeError.
    cdef _Document doc
    source = _getFSPathOrObject(source)

    if _isString(source):
        # strings are parsed directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # an explicit base URL replaces the URL stored in the document
        url_utf8 = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(<char*>doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_xcstr(url_utf8))
        return doc

    url = base_url if base_url is not None else _getFilenameForFile(source)

    # StringIO that is read from its start: parse its complete value
    if (hasattr(source, u'getvalue') and hasattr(source, u'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(source.getvalue(), url, parser)

    # generic file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, u'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError(f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'")
1903  
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    # Parse the file at 'url' and wrap the result in a _Document.
    return _documentFactory(_parseDocFromFile(url, parser), parser)
1907  
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    # Parse the string 'text' and wrap the result in a _Document.
    # Raises ValueError for input that is neither unicode nor bytes, and
    # for unicode input that carries an encoding declaration.
    if not isinstance(text, unicode):
        if not isinstance(text, bytes):
            raise ValueError(u"can only parse strings")
    elif _hasEncodingDeclaration(text):
        raise ValueError(
            u"Unicode strings with encoding declaration are not supported. "
            u"Please use bytes input or XML fragments without declaration.")
    return _documentFactory(_parseDoc(text, url, parser), parser)
1918  
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    # Parse from the file-like 'source' and wrap the result in a _Document.
    return _documentFactory(
        _parseDocFromFilelike(source, url, parser), parser)