/ lib / lxml / iterparse.pxi
iterparse.pxi
  1  # iterparse -- event-driven parsing
  2  
# Size argument passed to each ``read()`` call on the input source in
# ``iterparse._read_more_events()``.
DEF __ITERPARSE_CHUNK_SIZE = 32768
  4  
  5  cdef class iterparse:
    u"""iterparse(self, source, events=("end",), tag=None, \
                  attribute_defaults=False, dtd_validation=False, \
                  load_dtd=False, no_network=True, remove_blank_text=False, \
                  compact=True, resolve_entities=True, remove_comments=False, \
                  remove_pis=False, strip_cdata=True, encoding=None, \
                  html=False, recover=None, huge_tree=False, collect_ids=True, \
                  schema=None)
 11  
 12      Incremental parser.
 13  
 14      Parses XML into a tree and generates tuples (event, element) in a
 15      SAX-like fashion. ``event`` is any of 'start', 'end', 'start-ns',
 16      'end-ns'.
 17  
 18      For 'start' and 'end', ``element`` is the Element that the parser just
 19      found opening or closing.  For 'start-ns', it is a tuple (prefix, URI) of
 20      a new namespace declaration.  For 'end-ns', it is simply None.  Note that
 21      all start and end events are guaranteed to be properly nested.
 22  
 23      The keyword argument ``events`` specifies a sequence of event type names
 24      that should be generated.  By default, only 'end' events will be
 25      generated.
 26  
 27      The additional ``tag`` argument restricts the 'start' and 'end' events to
 28      those elements that match the given tag.  The ``tag`` argument can also be
 29      a sequence of tags to allow matching more than one tag.  By default,
 30      events are generated for all elements.  Note that the 'start-ns' and
 31      'end-ns' events are not impacted by this restriction.
 32  
 33      The other keyword arguments in the constructor are mainly based on the
 34      libxml2 parser configuration.  A DTD will also be loaded if validation or
 35      attribute default values are requested.
 36  
 37      Available boolean keyword arguments:
 38       - attribute_defaults: read default attributes from DTD
 39       - dtd_validation: validate (if DTD is available)
 40       - load_dtd: use DTD for parsing
 41       - no_network: prevent network access for related files
 42       - remove_blank_text: discard blank text nodes
 43       - remove_comments: discard comments
 44       - remove_pis: discard processing instructions
 45       - strip_cdata: replace CDATA sections by normal text content (default: True)
     - compact: save memory for short text content (default: True)
 47       - resolve_entities: replace entities by their text value (default: True)
 48       - huge_tree: disable security restrictions and support very deep trees
 49                    and very long text content (only affects libxml2 2.7+)
 50       - html: parse input as HTML (default: XML)
 51       - recover: try hard to parse through broken input (default: True for HTML,
 52                  False otherwise)
 53  
 54      Other keyword arguments:
 55       - encoding: override the document encoding
 56       - schema: an XMLSchema to validate against
 57      """
 58      cdef _FeedParser _parser
 59      cdef object _tag
 60      cdef object _events
 61      cdef readonly object root
 62      cdef object _source
 63      cdef object _filename
 64      cdef object _error
 65      cdef bint _close_source_after_read
 66  
 67      def __init__(self, source, events=(u"end",), *, tag=None,
 68                   attribute_defaults=False, dtd_validation=False,
 69                   load_dtd=False, no_network=True, remove_blank_text=False,
 70                   compact=True, resolve_entities=True, remove_comments=False,
 71                   remove_pis=False, strip_cdata=True, encoding=None,
 72                   html=False, recover=None, huge_tree=False, collect_ids=True,
 73                   XMLSchema schema=None):
 74          if not hasattr(source, 'read'):
 75              source = _getFSPathOrObject(source)
 76              self._filename = source
 77              if python.IS_PYTHON2:
 78                  source = _encodeFilename(source)
 79              source = open(source, 'rb')
 80              self._close_source_after_read = True
 81          else:
 82              self._filename = _getFilenameForFile(source)
 83              self._close_source_after_read = False
 84  
 85          if recover is None:
 86              recover = html
 87  
 88          if html:
 89              # make sure we're not looking for namespaces
 90              events = [event for event in events
 91                        if event not in ('start-ns', 'end-ns')]
 92              parser = HTMLPullParser(
 93                  events,
 94                  tag=tag,
 95                  recover=recover,
 96                  base_url=self._filename,
 97                  encoding=encoding,
 98                  remove_blank_text=remove_blank_text,
 99                  remove_comments=remove_comments,
100                  remove_pis=remove_pis,
101                  strip_cdata=strip_cdata,
102                  no_network=no_network,
103                  target=None,  # TODO
104                  schema=schema,
105                  compact=compact)
106          else:
107              parser = XMLPullParser(
108                  events,
109                  tag=tag,
110                  recover=recover,
111                  base_url=self._filename,
112                  encoding=encoding,
113                  attribute_defaults=attribute_defaults,
114                  dtd_validation=dtd_validation,
115                  load_dtd=load_dtd,
116                  no_network=no_network,
117                  schema=schema,
118                  huge_tree=huge_tree,
119                  remove_blank_text=remove_blank_text,
120                  resolve_entities=resolve_entities,
121                  remove_comments=remove_comments,
122                  remove_pis=remove_pis,
123                  strip_cdata=strip_cdata,
124                  collect_ids=True,
125                  target=None,  # TODO
126                  compact=compact)
127  
128          self._events = parser.read_events()
129          self._parser = parser
130          self._source = source
131  
    @property
    def error_log(self):
        """The error log of the last (or current) parser run.

        This is the ``feed_error_log`` of the underlying parser object.
        """
        return self._parser.feed_error_log
137  
    @property
    def resolvers(self):
        """The custom resolver registry of the last (or current) parser run.

        This is the ``resolvers`` attribute of the underlying parser object.
        """
        return self._parser.resolvers
143  
    @property
    def version(self):
        """The version of the underlying XML parser.

        This is the ``version`` attribute of the underlying parser object.
        """
        return self._parser.version
148  
    def set_element_class_lookup(self, ElementClassLookup lookup = None):
        u"""set_element_class_lookup(self, lookup = None)

        Set a lookup scheme for element classes generated from this parser.

        Reset it by passing None or nothing.

        The call is forwarded unchanged to the underlying parser object.
        """
        self._parser.set_element_class_lookup(lookup)
157  
158      def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
159          u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
160  
161          Creates a new element associated with this parser.
162          """
163          self._parser.makeelement(
164              _tag, attrib=None, nsmap=None, **_extra)
165  
166      @cython.final
167      cdef _close_source(self):
168          if self._source is None:
169              return
170          if not self._close_source_after_read:
171              self._source = None
172              return
173          try:
174              close = self._source.close
175          except AttributeError:
176              close = None
177          finally:
178              self._source = None
179          if close is not None:
180              close()
181  
    def __iter__(self):
        # The object is its own iterator; events are produced by __next__().
        return self
184  
    def __next__(self):
        # Return the next event tuple, reading more input on demand.
        # Events that the parser has already queued are delivered first.
        try:
            return next(self._events)
        except StopIteration:
            pass
        context = <_SaxParserContext>self._parser._getPushParserContext()
        if self._source is not None:
            # The source is still available: keep feeding chunks into the
            # parser until an event shows up or the input is exhausted.
            done = False
            while not done:
                try:
                    # _read_more_events() returns True once the source
                    # delivered no more data and the parser was closed.
                    done = self._read_more_events(context)
                    return next(self._events)
                except StopIteration:
                    pass  # no events yet
                except Exception as e:
                    # Remember the failure and release the source, but hand
                    # out events that are still queued before raising it.
                    self._error = e
                    self._close_source()
                    try:
                        return next(self._events)
                    except StopIteration:
                        break
        # nothing left to read or return
        if self._error is not None:
            # A stored error is raised exactly once: it is cleared first.
            error = self._error
            self._error = None
            raise error
        if (context._validator is not None
                and not context._validator.isvalid()):
            # A validator is attached to the parser context and does not
            # report the document as valid.
            _raiseParseError(context._c_ctxt, self._filename,
                             context._error_log)
        # no errors => all done
        raise StopIteration
217  
218      @cython.final
219      cdef bint _read_more_events(self, _SaxParserContext context) except -123:
220          data = self._source.read(__ITERPARSE_CHUNK_SIZE)
221          if not isinstance(data, bytes):
222              self._close_source()
223              raise TypeError("reading file objects must return bytes objects")
224          if not data:
225              try:
226                  self.root = self._parser.close()
227              finally:
228                  self._close_source()
229              return True
230          self._parser.feed(data)
231          return False
232  
233  
cdef enum _IterwalkSkipStates:
    # States that control whether iterwalk.skip_subtree() takes effect.

    # Set by iterwalk._start_node() after queuing 'start-ns'/'start' events.
    IWSKIP_NEXT_IS_START
    # Set by iterwalk.skip_subtree(); iterwalk.__next__() then does not
    # look for children of the current node.
    IWSKIP_SKIP_NEXT
    # Set by iterwalk._next_event() when it returns a 'start' or 'start-ns'
    # event; the only state in which skip_subtree() has an effect.
    IWSKIP_CAN_SKIP
    # Initial state, restored by iterwalk.__next__(); skip requests are ignored.
    IWSKIP_CANNOT_SKIP
239  
240  
241  cdef class iterwalk:
242      u"""iterwalk(self, element_or_tree, events=("end",), tag=None)
243  
244      A tree walker that generates events from an existing tree as if it
245      was parsing XML data with ``iterparse()``.
246  
247      Just as for ``iterparse()``, the ``tag`` argument can be a single tag or a
248      sequence of tags.
249  
250      After receiving a 'start' or 'start-ns' event, the children and
251      descendants of the current element can be excluded from iteration
252      by calling the ``skip_subtree()`` method.
253      """
254      cdef _MultiTagMatcher _matcher
255      cdef list   _node_stack
256      cdef list   _events
257      cdef object _pop_event
258      cdef object _include_siblings
259      cdef int    _index
260      cdef int    _event_filter
261      cdef _IterwalkSkipStates _skip_state
262  
263      def __init__(self, element_or_tree, events=(u"end",), tag=None):
264          cdef _Element root
265          cdef int ns_count
266          root = _rootNodeOrRaise(element_or_tree)
267          self._event_filter = _buildParseEventFilter(events)
268          if tag is None or tag == '*':
269              self._matcher = None
270          else:
271              self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
272          self._node_stack  = []
273          self._events = []
274          self._pop_event = self._events.pop
275          self._skip_state = IWSKIP_CANNOT_SKIP  # ignore all skip requests by default
276  
277          if self._event_filter:
278              self._index = 0
279              if self._matcher is not None and self._event_filter & PARSE_EVENT_FILTER_START:
280                  self._matcher.cacheTags(root._doc)
281  
282              # When processing an ElementTree, add events for the preceding comments/PIs.
283              if self._event_filter & (PARSE_EVENT_FILTER_COMMENT | PARSE_EVENT_FILTER_PI):
284                  if isinstance(element_or_tree, _ElementTree):
285                      self._include_siblings = root
286                      for elem in list(root.itersiblings(preceding=True))[::-1]:
287                          if self._event_filter & PARSE_EVENT_FILTER_COMMENT and elem.tag is Comment:
288                              self._events.append((u'comment', elem))
289                          elif self._event_filter & PARSE_EVENT_FILTER_PI and elem.tag is PI:
290                              self._events.append((u'pi', elem))
291  
292              ns_count = self._start_node(root)
293              self._node_stack.append( (root, ns_count) )
294          else:
295              self._index = -1
296  
    def __iter__(self):
        # The walker is its own iterator; events are produced by __next__().
        return self
299  
    def __next__(self):
        # Return the next event tuple of the tree walk, or raise StopIteration
        # when the walk is complete.
        cdef xmlNode* c_child
        cdef _Element node
        cdef _Element next_node
        cdef int ns_count = 0
        if self._events:
            # deliver events that are already queued first
            return self._next_event()
        if self._matcher is not None and self._index >= 0:
            node = self._node_stack[self._index][0]
            # NOTE(review): presumably refreshes the matcher's tag cache for
            # the current document before matching again - confirm.
            self._matcher.cacheTags(node._doc)

        # find next node
        while self._index >= 0:
            # the node at the current stack position is the one to descend from
            node = self._node_stack[self._index][0]

            if self._skip_state == IWSKIP_SKIP_NEXT:
                # skip_subtree() was called: act as if there were no children
                c_child = NULL
            else:
                # queue events for leading comments/PIs and stop at the first
                # child node that is neither of the two
                c_child = self._process_non_elements(
                    node._doc, _findChildForwards(node._c_node, 0))
            # reset on every pass, so a skip request is consumed here
            self._skip_state = IWSKIP_CANNOT_SKIP

            while c_child is NULL:
                # back off through parents
                self._index -= 1
                # pops the node from the stack and queues its end events
                node = self._end_node()
                if self._index < 0:
                    # the bottom of the stack (the root node) was popped
                    break
                # continue with whatever follows the node that was just closed
                c_child = self._process_non_elements(
                    node._doc, _nextElement(node._c_node))

            if c_child is not NULL:
                next_node = _elementFactory(node._doc, c_child)
                if self._event_filter & (PARSE_EVENT_FILTER_START |
                                         PARSE_EVENT_FILTER_START_NS):
                    # queues the selected 'start-ns'/'start' events
                    ns_count = self._start_node(next_node)
                elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
                    # only the count is needed, for the 'end-ns' events
                    # queued later by _end_node()
                    ns_count = _countNsDefs(next_node._c_node)
                self._node_stack.append( (next_node, ns_count) )
                self._index += 1
            if self._events:
                return self._next_event()

        if self._include_siblings is not None:
            # Set by __init__() when an _ElementTree is walked with comment/PI
            # events selected: process what follows the root node, once.
            node, self._include_siblings = self._include_siblings, None
            self._process_non_elements(node._doc, _nextElement(node._c_node))
            if self._events:
                return self._next_event()

        raise StopIteration
350  
351      @cython.final
352      cdef xmlNode* _process_non_elements(self, _Document doc, xmlNode* c_node):
353          while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
354              if c_node.type == tree.XML_COMMENT_NODE:
355                  if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
356                      self._events.append(
357                          (u"comment", _elementFactory(doc, c_node)))
358                  c_node = _nextElement(c_node)
359              elif c_node.type == tree.XML_PI_NODE:
360                  if self._event_filter & PARSE_EVENT_FILTER_PI:
361                      self._events.append(
362                          (u"pi", _elementFactory(doc, c_node)))
363                  c_node = _nextElement(c_node)
364              else:
365                  break
366          return c_node
367  
368      @cython.final
369      cdef _next_event(self):
370          if self._skip_state == IWSKIP_NEXT_IS_START:
371              if self._events[0][0] in (u'start', u'start-ns'):
372                  self._skip_state = IWSKIP_CAN_SKIP
373          return self._pop_event(0)
374  
375      def skip_subtree(self):
376          """Prevent descending into the current subtree.
377          Instead, the next returned event will be the 'end' event of the current element
378          (if included), ignoring any children or descendants.
379  
380          This has no effect right after an 'end' or 'end-ns' event.
381          """
382          if self._skip_state == IWSKIP_CAN_SKIP:
383              self._skip_state = IWSKIP_SKIP_NEXT
384  
385      @cython.final
386      cdef int _start_node(self, _Element node) except -1:
387          cdef int ns_count
388          if self._event_filter & PARSE_EVENT_FILTER_START_NS:
389              ns_count = _appendStartNsEvents(node._c_node, self._events)
390              if self._events:
391                  self._skip_state = IWSKIP_NEXT_IS_START
392          elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
393              ns_count = _countNsDefs(node._c_node)
394          else:
395              ns_count = 0
396          if self._event_filter & PARSE_EVENT_FILTER_START:
397              if self._matcher is None or self._matcher.matches(node._c_node):
398                  self._events.append( (u"start", node) )
399                  self._skip_state = IWSKIP_NEXT_IS_START
400          return ns_count
401  
402      @cython.final
403      cdef _Element _end_node(self):
404          cdef _Element node
405          cdef int i, ns_count
406          node, ns_count = self._node_stack.pop()
407          if self._event_filter & PARSE_EVENT_FILTER_END:
408              if self._matcher is None or self._matcher.matches(node._c_node):
409                  self._events.append( (u"end", node) )
410          if self._event_filter & PARSE_EVENT_FILTER_END_NS and ns_count:
411              event = (u"end-ns", None)
412              for i in range(ns_count):
413                  self._events.append(event)
414          return node
415  
416  
cdef int _countNsDefs(xmlNode* c_node):
    # Count the entries in the nsDef list of c_node that have a non-NULL href.
    cdef xmlNs* c_ns = c_node.nsDef
    cdef int ns_total = 0
    while c_ns is not NULL:
        if c_ns.href is not NULL:
            ns_total += 1
        c_ns = c_ns.next
    return ns_total
426  
427  
cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
    # Append one ("start-ns", (prefix, href)) event to event_list for each
    # entry in the nsDef list of c_node that has a non-NULL href, and return
    # how many events were appended.
    cdef xmlNs* c_ns = c_node.nsDef
    cdef int appended = 0
    while c_ns is not NULL:
        if c_ns.href is not NULL:
            event_list.append(
                (u"start-ns", (funicodeOrEmpty(c_ns.prefix),
                               funicode(c_ns.href))))
            appended += 1
        c_ns = c_ns.next
    return appended