/ lib / lxml / saxparser.pxi
saxparser.pxi
  1  # SAX-like interfaces
  2  
class XMLSyntaxAssertionError(XMLSyntaxError, AssertionError):
    """
    An XMLSyntaxError that additionally inherits from AssertionError for
    ElementTree / backwards compatibility reasons.

    Because of the double inheritance, handlers written as
    ``except AssertionError`` and handlers written as
    ``except XMLSyntaxError`` both catch this exception.

    This class may get replaced by a plain XMLSyntaxError in a future version.
    """
 10  
 11  
# Bit flags naming the SAX events a parser target wants to be called for.
# They are OR-ed together in _SaxParserTarget._sax_event_filter and tested
# when the libxml2 SAX callbacks are installed for a target.
ctypedef enum _SaxParserEvents:
    SAX_EVENT_START    = 1 << 0  # element start
    SAX_EVENT_END      = 1 << 1  # element end
    SAX_EVENT_DATA     = 1 << 2  # character data / CDATA
    SAX_EVENT_DOCTYPE  = 1 << 3  # internal subset (doctype) declaration
    SAX_EVENT_PI       = 1 << 4  # processing instruction
    SAX_EVENT_COMMENT  = 1 << 5  # comment
    SAX_EVENT_START_NS = 1 << 6  # namespace declaration comes into scope
    SAX_EVENT_END_NS   = 1 << 7  # namespace declaration goes out of scope
 21  
# Bit flags naming the parse events that are collected into the events
# iterator.  _buildParseEventFilter() maps the event names 'start', 'end',
# 'start-ns', 'end-ns', 'comment' and 'pi' to these flags.
# NOTE: the bit positions differ from those of _SaxParserEvents, so the two
# masks must not be mixed.
ctypedef enum _ParseEventFilter:
    PARSE_EVENT_FILTER_START     = 1 << 0  # 'start'
    PARSE_EVENT_FILTER_END       = 1 << 1  # 'end'
    PARSE_EVENT_FILTER_START_NS  = 1 << 2  # 'start-ns'
    PARSE_EVENT_FILTER_END_NS    = 1 << 3  # 'end-ns'
    PARSE_EVENT_FILTER_COMMENT   = 1 << 4  # 'comment'
    PARSE_EVENT_FILTER_PI        = 1 << 5  # 'pi'
 29  
 30  
 31  cdef int _buildParseEventFilter(events) except -1:
 32      cdef int event_filter
 33      event_filter = 0
 34      for event in events:
 35          if event == 'start':
 36              event_filter |= PARSE_EVENT_FILTER_START
 37          elif event == 'end':
 38              event_filter |= PARSE_EVENT_FILTER_END
 39          elif event == 'start-ns':
 40              event_filter |= PARSE_EVENT_FILTER_START_NS
 41          elif event == 'end-ns':
 42              event_filter |= PARSE_EVENT_FILTER_END_NS
 43          elif event == 'comment':
 44              event_filter |= PARSE_EVENT_FILTER_COMMENT
 45          elif event == 'pi':
 46              event_filter |= PARSE_EVENT_FILTER_PI
 47          else:
 48              raise ValueError, f"invalid event name '{event}'"
 49      return event_filter
 50  
 51  
cdef class _SaxParserTarget:
    # Base class for parser targets that are driven by the SAX callbacks in
    # this file.  Every handler below is a do-nothing default; subclasses
    # override the ones they implement and advertise them by setting the
    # matching SAX_EVENT_* bits in _sax_event_filter.

    # bit mask of SAX_EVENT_* flags; 0 means "no events requested"
    cdef int _sax_event_filter
    def __cinit__(self):
        self._sax_event_filter = 0

    # element start; the return value is treated as the created element
    # by the calling SAX start handlers
    cdef _handleSaxStart(self, tag, attrib, nsmap):
        return None
    # element end; the return value is used as the node of an 'end' event
    cdef _handleSaxEnd(self, tag):
        return None
    # character data
    cdef int _handleSaxData(self, data) except -1:
        return 0
    # doctype declaration
    cdef int _handleSaxDoctype(self, root_tag, public_id, system_id) except -1:
        return 0
    # processing instruction; the return value is used for a 'pi' event
    cdef _handleSaxPi(self, target, data):
        return None
    # comment; the return value is used for a 'comment' event
    cdef _handleSaxComment(self, comment):
        return None
    # namespace declaration starts
    cdef _handleSaxStartNs(self, prefix, uri):
        return None
    # namespace declaration ends
    cdef _handleSaxEndNs(self, prefix):
        return None
 73  
 74  
#@cython.final
@cython.internal
@cython.no_gc_clear  # Required because parent class uses it - Cython bug.
cdef class _SaxParserContext(_ParserContext):
    u"""This class maps SAX2 events to parser target events.

    Depending on its configuration, the context rewires the SAX handler
    table of a libxml2 parser context in one of two ways:

    * with a parser target set (``_setSaxParserTarget``), the callbacks are
      replaced so that events go to the target (``_connectTarget``);
    * with only an event filter set (``_setEventFilter``), the original
      callbacks are kept and wrapped so that parse events are collected
      into ``events_iterator`` (``_connectEvents``).
    """
    cdef _SaxParserTarget _target
    cdef _BaseParser _parser
    # original libxml2 callbacks, saved so that the wrappers can delegate
    cdef xmlparser.startElementNsSAX2Func _origSaxStart
    cdef xmlparser.endElementNsSAX2Func   _origSaxEnd
    cdef xmlparser.startElementSAXFunc    _origSaxStartNoNs
    cdef xmlparser.endElementSAXFunc      _origSaxEndNoNs
    cdef xmlparser.charactersSAXFunc      _origSaxData
    cdef xmlparser.cdataBlockSAXFunc      _origSaxCData
    cdef xmlparser.internalSubsetSAXFunc  _origSaxDoctype
    cdef xmlparser.commentSAXFunc         _origSaxComment
    cdef xmlparser.processingInstructionSAXFunc _origSaxPI
    cdef xmlparser.startDocumentSAXFunc   _origSaxStartDocument

    # for event collecting
    cdef int _event_filter      # bit mask of PARSE_EVENT_FILTER_* flags
    cdef list _ns_stack         # per open element: declared namespaces or None
    cdef list _node_stack       # open elements waiting for their 'end' event
    cdef _ParseEventsIterator events_iterator

    # for iterparse
    cdef _Element  _root
    cdef _MultiTagMatcher _matcher  # None means "match every tag"

    def __cinit__(self, _BaseParser parser):
        self._ns_stack = []
        self._node_stack = []
        self._parser = parser
        self.events_iterator = _ParseEventsIterator()

    cdef void _setSaxParserTarget(self, _SaxParserTarget target):
        """Set the parser target that receives the SAX events."""
        self._target = target

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
        """Initialise the base context, then install the SAX wrappers.

        A configured target takes precedence over plain event collection.
        """
        _ParserContext._initParserContext(self, c_ctxt)
        if self._target is not None:
            self._connectTarget(c_ctxt)
        elif self._event_filter:
            self._connectEvents(c_ctxt)

    cdef void _connectTarget(self, xmlparser.xmlParserCtxt* c_ctxt):
        """Wrap original SAX2 callbacks to call into parser target.

        Each callback is first reset to NULL and then only re-installed
        if the target requested the corresponding SAX_EVENT_* flag.
        """
        sax = c_ctxt.sax
        self._origSaxStart = sax.startElementNs = NULL
        self._origSaxStartNoNs = sax.startElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_START |
                                             SAX_EVENT_START_NS |
                                             SAX_EVENT_END_NS):
            # intercept => overwrite orig callback
            # FIXME: also intercept when collecting END events
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.startElementNs = _handleSaxTargetStart
            if self._target._sax_event_filter & SAX_EVENT_START:
                sax.startElement = _handleSaxTargetStartNoNs

        self._origSaxEnd = sax.endElementNs = NULL
        self._origSaxEndNoNs = sax.endElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_END |
                                             SAX_EVENT_END_NS):
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.endElementNs = _handleSaxEnd
            if self._target._sax_event_filter & SAX_EVENT_END:
                sax.endElement = _handleSaxEndNoNs

        # the same handler serves plain text and CDATA sections
        self._origSaxData = sax.characters = sax.cdataBlock = NULL
        if self._target._sax_event_filter & SAX_EVENT_DATA:
            sax.characters = sax.cdataBlock = _handleSaxData

        # doctype propagation is always required for entity replacement
        # (hence the original callback is kept unless the target wants it)
        self._origSaxDoctype = sax.internalSubset
        if self._target._sax_event_filter & SAX_EVENT_DOCTYPE:
            sax.internalSubset = _handleSaxTargetDoctype

        self._origSaxPI = sax.processingInstruction = NULL
        if self._target._sax_event_filter & SAX_EVENT_PI:
            sax.processingInstruction = _handleSaxTargetPI

        self._origSaxComment = sax.comment = NULL
        if self._target._sax_event_filter & SAX_EVENT_COMMENT:
            sax.comment = _handleSaxTargetComment

        # enforce entity replacement
        sax.reference = NULL
        c_ctxt.replaceEntities = 1

    cdef void _connectEvents(self, xmlparser.xmlParserCtxt* c_ctxt):
        """Wrap original SAX2 callbacks to collect parse events without parser target.

        The original callbacks are always remembered; a wrapper is only
        installed where the event filter (or HTML parsing) requires it.
        """
        sax = c_ctxt.sax
        self._origSaxStartDocument = sax.startDocument
        sax.startDocument = _handleSaxStartDocument

        # only override "start" event handler if needed
        # ('end' and 'end-ns' events also need it, because the start handler
        # pushes onto the node/namespace stacks that the end handler pops)
        self._origSaxStart = sax.startElementNs
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_START_NS |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.startElementNs = <xmlparser.startElementNsSAX2Func>_handleSaxStart

        self._origSaxStartNoNs = sax.startElement
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END):
            sax.startElement = <xmlparser.startElementSAXFunc>_handleSaxStartNoNs

        # only override "end" event handler if needed
        self._origSaxEnd = sax.endElementNs
        if self._event_filter == 0 or \
               self._event_filter & (PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.endElementNs = <xmlparser.endElementNsSAX2Func>_handleSaxEnd

        self._origSaxEndNoNs = sax.endElement
        if self._event_filter == 0 or \
               self._event_filter & PARSE_EVENT_FILTER_END:
            sax.endElement = <xmlparser.endElementSAXFunc>_handleSaxEndNoNs

        self._origSaxComment = sax.comment
        if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
            sax.comment = <xmlparser.commentSAXFunc>_handleSaxComment

        self._origSaxPI = sax.processingInstruction
        if self._event_filter & PARSE_EVENT_FILTER_PI:
            sax.processingInstruction = <xmlparser.processingInstructionSAXFunc>_handleSaxPIEvent

    cdef _setEventFilter(self, events, tag):
        """Configure which events to collect and, optionally, for which tags.

        No tag matcher is created if no events are selected or if ``tag``
        is None or '*'.
        """
        self._event_filter = _buildParseEventFilter(events)
        if not self._event_filter or tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)

    cdef int startDocument(self, xmlDoc* c_doc) except -1:
        """Create the document wrapper for ``c_doc`` and prepare the matcher."""
        try:
            self._doc = _documentFactory(c_doc, self._parser)
        finally:
            self._parser = None  # clear circular reference ASAP
        if self._matcher is not None:
            self._matcher.cacheTags(self._doc, True) # force entry in libxml2 dict
        return 0

    cdef int pushEvent(self, event, xmlNode* c_node) except -1:
        """Append an ``(event, node)`` pair for ``c_node`` to the event list."""
        cdef _Element root
        if self._root is None:
            # remember the root element as soon as the document has one
            root = self._doc.getroot()
            if root is not None and root._c_node.type == tree.XML_ELEMENT_NODE:
                self._root = root
        node = _elementFactory(self._doc, c_node)
        self.events_iterator._events.append( (event, node) )
        return 0

    cdef int flushEvents(self) except -1:
        """Emit the pending 'end' / 'end-ns' events of all still-open elements."""
        events = self.events_iterator._events
        while self._node_stack:
            events.append( ('end', self._node_stack.pop()) )
            _pushSaxNsEndEvents(self)
        # drain namespace entries of elements that had no node on the stack
        while self._ns_stack:
            _pushSaxNsEndEvents(self)

    cdef void _handleSaxException(self, xmlparser.xmlParserCtxt* c_ctxt):
        """Record the current exception and make libxml2 stop parsing."""
        if c_ctxt.errNo == xmlerror.XML_ERR_OK:
            # make sure the parser context reports a failure
            c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR
        # stop parsing immediately
        c_ctxt.wellFormed = 0
        c_ctxt.disableSAX = 1
        c_ctxt.instate = xmlparser.XML_PARSER_EOF
        # NOTE(review): _store_raised() is not defined in this file section;
        # presumably inherited and keeps the exception for later re-raising.
        self._store_raised()
250  
251  
@cython.final
@cython.internal
cdef class _ParseEventsIterator:
    """A reusable parse events iterator"""
    # Events are appended to _events by the SAX handlers; _event_index is
    # the position of the next event to hand out.
    cdef list _events
    cdef int _event_index

    def __cinit__(self):
        self._event_index = 0
        self._events = []

    def __iter__(self):
        return self

    def __next__(self):
        cdef int pos = self._event_index
        pending = self._events
        if pos >= 2**10 or 2 * pos >= len(pending):
            # Either a large or a dominating prefix has been consumed:
            # drop it so that the list does not grow without bound.
            if pos:
                del pending[:pos]
                pos = 0
                self._event_index = 0
            # 'pos' is always 0 at this point
            if not pending:
                raise StopIteration
        self._event_index = pos + 1
        return pending[pos]
279  
280  
cdef list _build_prefix_uri_list(_SaxParserContext context, int c_nb_namespaces,
                                 const_xmlChar** c_namespaces):
    "Build [(prefix, uri)] list of declared namespaces."
    # c_namespaces is a flat array of (prefix, uri) C string pairs;
    # 'context' is not used here.
    cdef int idx
    cdef list pairs = []
    for idx in range(c_nb_namespaces):
        prefix = funicodeOrEmpty(c_namespaces[2 * idx])
        uri = funicode(c_namespaces[2 * idx + 1])
        pairs.append((prefix, uri))
    return pairs
290  
291  
cdef void _handleSaxStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) with gil:
    """SAX2 startElementNs wrapper used when collecting events without a target.

    Emits 'start-ns' events for newly declared namespaces, delegates tree
    building to the original libxml2 callback and then records the element
    for 'start' / 'end' events.  Being a void C callback, it never lets an
    exception escape: errors are stored via the context and stop the parser.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    cdef int event_filter = context._event_filter
    try:
        # The namespace list is only built if someone will look at it.
        if (c_nb_namespaces and
                event_filter & (PARSE_EVENT_FILTER_START_NS |
                                PARSE_EVENT_FILTER_END_NS)):
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)
            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
        else:
            declared_namespaces = None

        # let libxml2 create the node before an event refers to it
        context._origSaxStart(c_ctxt, c_localname, c_prefix, c_namespace,
                              c_nb_namespaces, c_namespaces, c_nb_attributes,
                              c_nb_defaulted, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)

        # One entry per element (possibly None), so that the end handler
        # can pop exactly one entry per closed element.
        if event_filter & PARSE_EVENT_FILTER_END_NS:
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace, c_localname, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
332  
333  
cdef void _handleSaxTargetStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) with gil:
    """SAX2 startElementNs callback used when parsing into a parser target.

    Reports namespace declarations and the element start to the target
    (as far as the target asked for them) and additionally collects
    'start-ns' / 'start' events if an event filter is set.  Exceptions are
    stored via the context and never propagate into libxml2.
    """
    cdef int i
    cdef size_t c_len
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private

    cdef int event_filter = context._event_filter
    cdef int sax_event_filter = context._target._sax_event_filter
    try:
        if c_nb_namespaces:
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)

            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))

            if sax_event_filter & SAX_EVENT_START_NS:
                for prefix, uri in declared_namespaces:
                    context._target._handleSaxStartNs(prefix, uri)
                #if not context._target._sax_event_filter & SAX_EVENT_START:
                #    # *Only* collecting start-ns events.
                #    return
        else:
            declared_namespaces = None

        if sax_event_filter & SAX_EVENT_START:
            if c_nb_defaulted > 0:
                # only add default attributes if we asked for them
                if c_ctxt.loadsubset & xmlparser.XML_COMPLETE_ATTRS == 0:
                    c_nb_attributes -= c_nb_defaulted
            if c_nb_attributes == 0:
                attrib = IMMUTABLE_EMPTY_MAPPING
            else:
                attrib = {}
                # each attribute occupies five consecutive array slots, of
                # which [0] and [2] go into the qualified name and [3]/[4]
                # delimit the value bytes
                for i in xrange(c_nb_attributes):
                    name = _namespacedNameFromNsName(
                        c_attributes[2], c_attributes[0])
                    if c_attributes[3] is NULL:
                        value = ''
                    else:
                        c_len = c_attributes[4] - c_attributes[3]
                        value = c_attributes[3][:c_len].decode('utf8')
                    attrib[name] = value
                    c_attributes += 5

            nsmap = dict(declared_namespaces) if c_nb_namespaces else IMMUTABLE_EMPTY_MAPPING

            element = _callTargetSaxStart(
                context, c_ctxt,
                _namespacedNameFromNsName(c_namespace, c_localname),
                attrib, nsmap)
        else:
            element = None

        # one stack entry per element, consumed by _pushSaxNsEndEvents()
        if (event_filter & PARSE_EVENT_FILTER_END_NS or
                sax_event_filter & SAX_EVENT_END_NS):
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace,
                               c_localname, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
407  
408  
cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
                              const_xmlChar** c_attributes) with gil:
    """SAX1 startElement wrapper used when collecting events without a target.

    Delegates to the original callback first, then records the new element
    for 'start' / 'end' events.  Exceptions are stored via the context and
    never propagate into libxml2.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._origSaxStartNoNs(c_ctxt, c_name, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            # no namespace in this (SAX1) code path => NULL href
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
426  
427  
cdef void _handleSaxTargetStartNoNs(void* ctxt, const_xmlChar* c_name,
                                    const_xmlChar** c_attributes) with gil:
    """SAX1 startElement callback used when parsing into a parser target.

    Builds the attribute mapping, reports the element start to the target
    and collects a 'start' event if requested.  Exceptions are stored via
    the context and never propagate into libxml2.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if c_attributes is NULL:
            attrib = IMMUTABLE_EMPTY_MAPPING
        else:
            attrib = {}
            # NULL-terminated array of alternating name / value strings
            while c_attributes[0] is not NULL:
                name = funicode(c_attributes[0])
                attrib[name] = funicodeOrEmpty(c_attributes[1])
                c_attributes += 2
        element = _callTargetSaxStart(
            context, c_ctxt, funicode(c_name),
            attrib, IMMUTABLE_EMPTY_MAPPING)
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
453  
454  
cdef _callTargetSaxStart(_SaxParserContext context,
                         xmlparser.xmlParserCtxt* c_ctxt,
                         tag, attrib, nsmap):
    """Pass an element start to the target and return what the target returned.

    If the result is an _Element and parser input information is available,
    the current input line number is stored on its C node, capped at 65535.
    """
    element = context._target._handleSaxStart(tag, attrib, nsmap)
    if element is None or c_ctxt.input is NULL:
        return element
    if isinstance(element, _Element):
        if c_ctxt.input.line < 65535:
            (<_Element>element)._c_node.line = <unsigned short>c_ctxt.input.line
        else:
            (<_Element>element)._c_node.line = 65535
    return element
465  
466  
cdef int _pushSaxStartEvent(_SaxParserContext context,
                            xmlparser.xmlParserCtxt* c_ctxt,
                            const_xmlChar* c_href,
                            const_xmlChar* c_name, node) except -1:
    """Record a started element for 'start' and later 'end' events.

    Does nothing if a tag matcher is set and rejects the element.
    """
    if (context._matcher is not None and
            not context._matcher.matchesNsTag(c_href, c_name)):
        return 0

    cdef bint without_target = context._target is None
    if without_target and node is None:
        # no target created an element => wrap the node libxml2 just built
        assert context._doc is not None
        node = _elementFactory(context._doc, c_ctxt.node)

    if context._event_filter & PARSE_EVENT_FILTER_START:
        context.events_iterator._events.append(('start', node))
    if without_target and context._event_filter & PARSE_EVENT_FILTER_END:
        # remembered until the matching end tag arrives
        context._node_stack.append(node)
    return 0
482  
483  
cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
                        const_xmlChar* c_prefix,
                        const_xmlChar* c_namespace) with gil:
    """SAX2 endElementNs callback, shared by target parsing and event collection.

    With a target, the element end is reported to the target (if it asked
    for end events); otherwise the original libxml2 callback is invoked.
    Afterwards, 'end' and 'end-ns' events are pushed as configured.
    Exceptions are stored via the context and never propagate into libxml2.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if context._target is not None:
            if context._target._sax_event_filter & SAX_EVENT_END:
                node = context._target._handleSaxEnd(
                    _namespacedNameFromNsName(c_namespace, c_localname))
            else:
                node = None
        else:
            context._origSaxEnd(c_ctxt, c_localname, c_prefix, c_namespace)
            # without a target, the node is taken from the node stack
            # inside _pushSaxEndEvent()
            node = None
        _pushSaxEndEvent(context, c_namespace, c_localname, node)
        _pushSaxNsEndEvents(context)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
507  
508  
cdef void _handleSaxEndNoNs(void* ctxt, const_xmlChar* c_name) with gil:
    """SAX1 endElement callback, shared by target parsing and event collection.

    Reports the element end to the target if there is one, otherwise calls
    the original libxml2 callback, and then pushes an 'end' event if
    configured.  Exceptions are stored via the context and never propagate
    into libxml2.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if context._target is not None:
            node = context._target._handleSaxEnd(funicode(c_name))
        else:
            context._origSaxEndNoNs(c_ctxt, c_name)
            node = None
        # no namespace in this (SAX1) code path => NULL href
        _pushSaxEndEvent(context, NULL, c_name, node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
525  
526  
cdef int _pushSaxNsEndEvents(_SaxParserContext context) except -1:
    """Close the namespace declarations of the element that just ended.

    Pops one entry from the namespace stack and, in reverse declaration
    order, notifies the target and/or appends ('end-ns', None) events.
    The stack is left untouched if nobody is interested in 'end-ns'.
    """
    cdef bint collect_events = context._event_filter & PARSE_EVENT_FILTER_END_NS
    cdef bint notify_target = (
        context._target is not None
        and context._target._sax_event_filter & SAX_EVENT_END_NS)
    if not (collect_events or notify_target):
        return 0

    cdef list closing = context._ns_stack.pop()
    if closing is None:
        # the element did not declare any namespaces
        return 0

    cdef tuple entry
    for entry in reversed(closing):
        if notify_target:
            context._target._handleSaxEndNs(entry[0])
        if collect_events:
            context.events_iterator._events.append(('end-ns', None))
    return 0
547  
548  
cdef int _pushSaxEndEvent(_SaxParserContext context,
                          const_xmlChar* c_href,
                          const_xmlChar* c_name, node) except -1:
    """Append an 'end' event for a closed element, if such events are wanted.

    Without a parser target, the node comes from the node stack that
    _pushSaxStartEvent() filled; with a target, ``node`` is used as passed.
    """
    if not (context._event_filter & PARSE_EVENT_FILTER_END):
        return 0
    if (context._matcher is not None and
            not context._matcher.matchesNsTag(c_href, c_name)):
        return 0
    if context._target is None:
        node = context._node_stack.pop()
    context.events_iterator._events.append(('end', node))
    return 0
559  
560  
cdef void _handleSaxData(void* ctxt, const_xmlChar* c_data, int data_len) with gil:
    """SAX characters / cdataBlock callback: pass decoded text to the target.

    The first ``data_len`` bytes of ``c_data`` are decoded as UTF-8.
    Exceptions are stored via the context and never propagate into libxml2.
    """
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._target._handleSaxData(
            c_data[:data_len].decode('utf8'))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
574  
575  
cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
                                  const_xmlChar* c_public,
                                  const_xmlChar* c_system) with gil:
    """SAX internalSubset callback: report the doctype to the target.

    NULL strings are passed on as None.  Exceptions are stored via the
    context and never propagate into libxml2.
    """
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._target._handleSaxDoctype(
            funicodeOrNone(c_name),
            funicodeOrNone(c_public),
            funicodeOrNone(c_system))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
593  
594  
cdef void _handleSaxStartDocument(void* ctxt) with gil:
    """SAX startDocument wrapper installed when collecting events.

    Calls the original callback, then lets the context set up its document
    wrapper for the newly created C document.  Exceptions are stored via
    the context and never propagate into libxml2.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    # the original callback runs first; myDoc is read only afterwards
    context._origSaxStartDocument(ctxt)
    c_doc = c_ctxt.myDoc
    try:
        context.startDocument(c_doc)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
608  
609  
cdef void _handleSaxTargetPI(void* ctxt, const_xmlChar* c_target,
                             const_xmlChar* c_data) with gil:
    """SAX processingInstruction callback: report the PI to the target.

    The target's return value becomes the payload of a 'pi' event if such
    events are collected.  Exceptions are stored via the context and never
    propagate into libxml2.
    """
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        pi = context._target._handleSaxPi(
            funicodeOrNone(c_target),
            funicodeOrEmpty(c_data))
        if context._event_filter & PARSE_EVENT_FILTER_PI:
            context.events_iterator._events.append(('pi', pi))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
627  
628  
cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
                            const_xmlChar* data) with gil:
    """SAX processingInstruction wrapper that collects 'pi' events.

    Calls the original callback, looks up the node it is expected to have
    created and pushes a 'pi' event for it.  Exceptions are stored via the
    context and never propagate into libxml2.
    """
    # can only be called when collecting pi events
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxPI(ctxt, target, data)
    c_node = _findLastEventNode(c_ctxt)
    if c_node is NULL:
        # no node found => nothing to report
        return
    try:
        context.pushEvent('pi', c_node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
646  
647  
cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) with gil:
    """SAX comment callback: report the comment text to the target.

    The target's return value becomes the payload of a 'comment' event if
    such events are collected.  Exceptions are stored via the context and
    never propagate into libxml2.
    """
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        comment = context._target._handleSaxComment(funicodeOrEmpty(c_data))
        if context._event_filter & PARSE_EVENT_FILTER_COMMENT:
            context.events_iterator._events.append(('comment', comment))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
662  
663  
cdef void _handleSaxComment(void* ctxt, const_xmlChar* text) with gil:
    """SAX comment wrapper that collects 'comment' events.

    Calls the original callback, looks up the node it is expected to have
    created and pushes a 'comment' event for it.  Exceptions are stored via
    the context and never propagate into libxml2.
    """
    # can only be called when collecting comment events
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxComment(ctxt, text)
    c_node = _findLastEventNode(c_ctxt)
    if c_node is NULL:
        # no node found => nothing to report
        return
    try:
        context.pushEvent('comment', c_node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
680  
681  
cdef inline xmlNode* _findLastEventNode(xmlparser.xmlParserCtxt* c_ctxt):
    # Locate the node that libxml2 created last for a comment/PI.
    # This mimics what libxml2 does: the node goes into the internal or
    # external DTD subset while inside one, otherwise to the document level
    # if no node is current, else below or after the current node.
    if c_ctxt.inSubset == 1:
        return c_ctxt.myDoc.intSubset.last
    if c_ctxt.inSubset == 2:
        return c_ctxt.myDoc.extSubset.last
    if c_ctxt.node is NULL:
        return c_ctxt.myDoc.last
    if c_ctxt.node.type == tree.XML_ELEMENT_NODE:
        return c_ctxt.node.last
    return c_ctxt.node.next
694  
695  
696  ############################################################
697  ## ET compatible XML tree builder
698  ############################################################
699  
cdef class TreeBuilder(_SaxParserTarget):
    u"""TreeBuilder(self, element_factory=None, parser=None,
                    comment_factory=None, pi_factory=None,
                    insert_comments=True, insert_pis=True)

    Parser target that builds a tree from parse event callbacks.

    The factory arguments can be used to influence the creation of
    elements, comments and processing instructions.

    By default, comments and processing instructions are inserted into
    the tree, but they can be ignored by passing the respective flags.

    The final tree is returned by the ``close()`` method.
    """
    cdef _BaseParser _parser          # passed to _makeElement() for the root element
    cdef object _factory              # optional user element factory: factory(tag, attrib)
    cdef object _comment_factory      # callable(text); defaults to Comment
    cdef object _pi_factory           # callable(target, data); defaults to ProcessingInstruction
    cdef list _data                   # text chunks collected since the last flush
    cdef list _element_stack          # currently open elements, innermost last
    cdef object _element_stack_pop    # cached bound method self._element_stack.pop
    cdef _Element _last # may be None
    cdef bint _in_tail                # true: pending text is tail of _last; false: its text
    cdef bint _insert_comments        # whether comments are appended to the tree
    cdef bint _insert_pis             # whether processing instructions are appended to the tree

    def __init__(self, *, element_factory=None, parser=None,
                 comment_factory=None, pi_factory=None,
                 bint insert_comments=True, bint insert_pis=True):
        # Register for exactly the SAX events this target implements handlers for.
        self._sax_event_filter = \
            SAX_EVENT_START | SAX_EVENT_END | SAX_EVENT_DATA | \
            SAX_EVENT_PI | SAX_EVENT_COMMENT
        self._data = [] # data collector
        self._element_stack = [] # element stack
        # Cache the bound pop() once; the list object is never replaced afterwards.
        self._element_stack_pop = self._element_stack.pop
        self._last = None # last element
        self._in_tail = 0 # true if we're after an end tag
        self._factory = element_factory
        self._comment_factory = comment_factory if comment_factory is not None else Comment
        self._pi_factory = pi_factory if pi_factory is not None else ProcessingInstruction
        self._insert_comments = insert_comments
        self._insert_pis = insert_pis
        self._parser = parser

    @cython.final
    cdef int _flush(self) except -1:
        u"""Attach the collected character data to the last node.

        The joined text becomes ``.tail`` of ``_last`` when we are behind an
        end tag (or an inserted comment/PI), and ``.text`` otherwise.  Data
        collected while no node has been seen yet is discarded.  The data
        buffer is emptied in either case.
        """
        if self._data:
            if self._last is not None:
                text = u"".join(self._data)
                if self._in_tail:
                    assert self._last.tail is None, u"internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, u"internal error (text)"
                    self._last.text = text
            # Clear in place so the list object stays the same.
            del self._data[:]
        return 0

    # internal SAX event handlers

    @cython.final
    cdef _handleSaxStart(self, tag, attrib, nsmap):
        u"""Open a new element, push it on the stack and return it.

        Uses the user provided element factory if there is one (note that
        ``nsmap`` is not passed to it); otherwise creates a subelement of the
        currently open element, or a new root element if the stack is empty.
        """
        self._flush()
        if self._factory is not None:
            self._last = self._factory(tag, attrib)
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
        elif self._element_stack:
            self._last = _makeSubElement(
                self._element_stack[-1], tag, None, None, attrib, nsmap, None)
        else:
            self._last = _makeElement(
                tag, NULL, None, self._parser, None, None, attrib, nsmap, None)
        self._element_stack.append(self._last)
        # Text that follows a start tag is the element's .text.
        self._in_tail = 0
        return self._last

    @cython.final
    cdef _handleSaxEnd(self, tag):
        u"""Close the innermost open element and return it.

        ``tag`` is not checked here; see ``end()`` for the consistency check.
        Popping from an empty stack raises IndexError.
        """
        self._flush()
        self._last = self._element_stack_pop()
        # Text that follows an end tag is the closed element's .tail.
        self._in_tail = 1
        return self._last

    @cython.final
    cdef int _handleSaxData(self, data) except -1:
        u"""Collect a chunk of character data until the next flush."""
        self._data.append(data)
        return 0

    @cython.final
    cdef _handleSaxPi(self, target, data):
        u"""Create a processing instruction and return it.

        The PI is always created through the PI factory.  It is only appended
        to the tree (and made the current node) if ``insert_pis`` is true.
        """
        elem = self._pi_factory(target, data)
        if self._insert_pis:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            # Text that follows the PI is its .tail.
            self._in_tail = 1
        # Return the created PI itself, not self._last: with insert_pis
        # disabled, self._last still refers to the previously handled node
        # (or None).  This mirrors _handleSaxComment().
        return elem

    @cython.final
    cdef _handleSaxComment(self, comment):
        u"""Create a comment and return it.

        The comment is always created through the comment factory.  It is only
        appended to the tree (and made the current node) if
        ``insert_comments`` is true.
        """
        elem = self._comment_factory(comment)
        if self._insert_comments:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            # Text that follows the comment is its .tail.
            self._in_tail = 1
        return elem

    # Python level event handlers

    def close(self):
        u"""close(self)

        Flushes the builder buffers, and returns the toplevel document
        element.  Raises XMLSyntaxError on inconsistencies.
        """
        # NOTE(review): no _flush() call is made here, so character data
        # received after the last start/end/comment/PI event stays in the
        # buffer -- confirm whether that is intended.
        # NOTE(review): the return value is self._last, which is also
        # overwritten by inserted comments/PIs even when no element is open.
        if self._element_stack:
            raise XMLSyntaxAssertionError("missing end tags")
        # TODO: this does not necessarily seem like an error case.  Why not just return None?
        if self._last is None:
            raise XMLSyntaxAssertionError("missing toplevel element")
        return self._last

    def data(self, data):
        u"""data(self, data)

        Adds text to the current element.  The value should be either an
        8-bit string containing ASCII text, or a Unicode string.
        """
        self._handleSaxData(data)

    def start(self, tag, attrs, nsmap=None):
        u"""start(self, tag, attrs, nsmap=None)

        Opens a new element.
        """
        if nsmap is None:
            nsmap = IMMUTABLE_EMPTY_MAPPING
        return self._handleSaxStart(tag, attrs, nsmap)

    def end(self, tag):
        u"""end(self, tag)

        Closes the current element.
        """
        element = self._handleSaxEnd(tag)
        # The element has already been popped at this point; the check only
        # reports the mismatch (and is skipped when assertions are disabled).
        assert self._last.tag == tag,\
            f"end tag mismatch (expected {self._last.tag}, got {tag})"
        return element

    def pi(self, target, data=None):
        u"""pi(self, target, data=None)

        Creates a processing instruction using the factory, appends it
        (unless disabled) and returns it.
        """
        return self._handleSaxPi(target, data)

    def comment(self, comment):
        u"""comment(self, comment)

        Creates a comment using the factory, appends it (unless disabled)
        and returns it.
        """
        return self._handleSaxComment(comment)