iterparse.pxi
# iterparse -- event-driven parsing

# Number of bytes requested from the source per read() call.
DEF __ITERPARSE_CHUNK_SIZE = 32768

cdef class iterparse:
    u"""iterparse(self, source, events=("end",), tag=None, \
                  attribute_defaults=False, dtd_validation=False, \
                  load_dtd=False, no_network=True, remove_blank_text=False, \
                  remove_comments=False, remove_pis=False, encoding=None, \
                  html=False, recover=None, huge_tree=False, schema=None)

    Incremental parser.

    Parses XML into a tree and generates tuples (event, element) in a
    SAX-like fashion. ``event`` is any of 'start', 'end', 'start-ns',
    'end-ns'.

    For 'start' and 'end', ``element`` is the Element that the parser just
    found opening or closing. For 'start-ns', it is a tuple (prefix, URI) of
    a new namespace declaration. For 'end-ns', it is simply None. Note that
    all start and end events are guaranteed to be properly nested.

    The keyword argument ``events`` specifies a sequence of event type names
    that should be generated. By default, only 'end' events will be
    generated.

    The additional ``tag`` argument restricts the 'start' and 'end' events to
    those elements that match the given tag. The ``tag`` argument can also be
    a sequence of tags to allow matching more than one tag. By default,
    events are generated for all elements. Note that the 'start-ns' and
    'end-ns' events are not impacted by this restriction.

    The other keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration. A DTD will also be loaded if validation or
    attribute default values are requested.

    Available boolean keyword arguments:
     - attribute_defaults: read default attributes from DTD
     - dtd_validation: validate (if DTD is available)
     - load_dtd: use DTD for parsing
     - no_network: prevent network access for related files
     - remove_blank_text: discard blank text nodes
     - remove_comments: discard comments
     - remove_pis: discard processing instructions
     - strip_cdata: replace CDATA sections by normal text content (default: True)
     - compact: save memory for short text content (default: True)
     - resolve_entities: replace entities by their text value (default: True)
     - huge_tree: disable security restrictions and support very deep trees
                  and very long text content (only affects libxml2 2.7+)
     - html: parse input as HTML (default: XML)
     - recover: try hard to parse through broken input (default: True for HTML,
                False otherwise)
     - collect_ids: passed through to the XML parser (default: True;
                    not forwarded in HTML mode)

    Other keyword arguments:
     - encoding: override the document encoding
     - schema: an XMLSchema to validate against
    """
    # Pull parser that receives the data chunks and produces the events.
    cdef _FeedParser _parser
    cdef object _tag
    # Iterator over pending events, as returned by parser.read_events().
    cdef object _events
    # Result of closing the parser; assigned once the source is exhausted.
    cdef readonly object root
    # File-like object being read; reset to None once reading has finished.
    cdef object _source
    cdef object _filename
    # Exception caught while reading/feeding; re-raised after pending events.
    cdef object _error
    # True only if the source was opened here and therefore must be closed here.
    cdef bint _close_source_after_read

    def __init__(self, source, events=(u"end",), *, tag=None,
                 attribute_defaults=False, dtd_validation=False,
                 load_dtd=False, no_network=True, remove_blank_text=False,
                 compact=True, resolve_entities=True, remove_comments=False,
                 remove_pis=False, strip_cdata=True, encoding=None,
                 html=False, recover=None, huge_tree=False, collect_ids=True,
                 XMLSchema schema=None):
        if not hasattr(source, 'read'):
            # Not a file-like object: treat it as a path and open it ourselves.
            source = _getFSPathOrObject(source)
            self._filename = source
            if python.IS_PYTHON2:
                source = _encodeFilename(source)
            source = open(source, 'rb')
            self._close_source_after_read = True
        else:
            self._filename = _getFilenameForFile(source)
            self._close_source_after_read = False

        if recover is None:
            recover = html

        try:
            if html:
                # make sure we're not looking for namespaces
                events = [event for event in events
                          if event not in ('start-ns', 'end-ns')]
                parser = HTMLPullParser(
                    events,
                    tag=tag,
                    recover=recover,
                    base_url=self._filename,
                    encoding=encoding,
                    remove_blank_text=remove_blank_text,
                    remove_comments=remove_comments,
                    remove_pis=remove_pis,
                    strip_cdata=strip_cdata,
                    no_network=no_network,
                    target=None,  # TODO
                    schema=schema,
                    compact=compact)
            else:
                parser = XMLPullParser(
                    events,
                    tag=tag,
                    recover=recover,
                    base_url=self._filename,
                    encoding=encoding,
                    attribute_defaults=attribute_defaults,
                    dtd_validation=dtd_validation,
                    load_dtd=load_dtd,
                    no_network=no_network,
                    schema=schema,
                    huge_tree=huge_tree,
                    remove_blank_text=remove_blank_text,
                    resolve_entities=resolve_entities,
                    remove_comments=remove_comments,
                    remove_pis=remove_pis,
                    strip_cdata=strip_cdata,
                    # honour the caller's choice instead of a hard-coded True
                    collect_ids=collect_ids,
                    target=None,  # TODO
                    compact=compact)

            self._events = parser.read_events()
        except BaseException:
            # Do not leak a file handle that was opened above if the parser
            # could not be set up.
            if self._close_source_after_read:
                source.close()
            raise

        self._parser = parser
        self._source = source

    @property
    def error_log(self):
        """The error log of the last (or current) parser run.
        """
        return self._parser.feed_error_log

    @property
    def resolvers(self):
        """The custom resolver registry of the last (or current) parser run.
        """
        return self._parser.resolvers

    @property
    def version(self):
        """The version of the underlying XML parser."""
        return self._parser.version

    def set_element_class_lookup(self, ElementClassLookup lookup = None):
        u"""set_element_class_lookup(self, lookup = None)

        Set a lookup scheme for element classes generated from this parser.

        Reset it by passing None or nothing.
        """
        self._parser.set_element_class_lookup(lookup)

    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
        u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

        Creates a new element associated with this parser.

        All arguments are forwarded to the underlying parser's
        ``makeelement()`` and its result is returned.
        """
        return self._parser.makeelement(
            _tag, attrib=attrib, nsmap=nsmap, **_extra)

    @cython.final
    cdef _close_source(self):
        # Drop the reference to the source and, if it was opened by this
        # object, close it.  Calling this again afterwards is a no-op.
        if self._source is None:
            return
        if not self._close_source_after_read:
            self._source = None
            return
        try:
            close = self._source.close
        except AttributeError:
            close = None
        finally:
            # forget the source before calling close(), even if the lookup failed
            self._source = None
        if close is not None:
            close()

    def __iter__(self):
        return self

    def __next__(self):
        # Fast path: an event is already waiting.
        try:
            return next(self._events)
        except StopIteration:
            pass
        context = <_SaxParserContext>self._parser._getPushParserContext()
        if self._source is not None:
            done = False
            while not done:
                try:
                    done = self._read_more_events(context)
                    return next(self._events)
                except StopIteration:
                    pass  # no events yet
                except Exception as e:
                    # Remember the failure, but first hand out any events
                    # that are still pending.
                    self._error = e
                    self._close_source()
                    try:
                        return next(self._events)
                    except StopIteration:
                        break
        # nothing left to read or return
        if self._error is not None:
            error = self._error
            self._error = None
            raise error
        if (context._validator is not None
                and not context._validator.isvalid()):
            _raiseParseError(context._c_ctxt, self._filename,
                             context._error_log)
        # no errors => all done
        raise StopIteration

    @cython.final
    cdef bint _read_more_events(self, _SaxParserContext context) except -123:
        # Read one chunk from the source and feed it to the parser.
        # Returns True once the source is exhausted (the parser has then been
        # closed and ``self.root`` assigned), False if more data may follow.
        # Raises TypeError if read() does not return a bytes object.
        data = self._source.read(__ITERPARSE_CHUNK_SIZE)
        if not isinstance(data, bytes):
            self._close_source()
            raise TypeError("reading file objects must return bytes objects")
        if not data:
            try:
                self.root = self._parser.close()
            finally:
                self._close_source()
            return True
        self._parser.feed(data)
        return False


# State machine behind iterwalk.skip_subtree().
cdef enum _IterwalkSkipStates:
    IWSKIP_NEXT_IS_START   # a 'start'/'start-ns' event has been queued
    IWSKIP_SKIP_NEXT       # skip_subtree() was called: do not descend
    IWSKIP_CAN_SKIP        # last returned event allows skip_subtree()
    IWSKIP_CANNOT_SKIP     # skip requests are ignored


cdef class iterwalk:
    u"""iterwalk(self, element_or_tree, events=("end",), tag=None)

    A tree walker that generates events from an existing tree as if it
    was parsing XML data with ``iterparse()``.

    Just as for ``iterparse()``, the ``tag`` argument can be a single tag or a
    sequence of tags.

    After receiving a 'start' or 'start-ns' event, the children and
    descendants of the current element can be excluded from iteration
    by calling the ``skip_subtree()`` method.
    """
    # Tag filter for 'start'/'end' events; None means "match everything".
    cdef _MultiTagMatcher _matcher
    # Stack of (element, ns_count) tuples for the currently open elements.
    cdef list _node_stack
    # Queue of events that are ready to be returned.
    cdef list _events
    # Bound ``self._events.pop``.
    cdef object _pop_event
    # Root element whose following comment/PI siblings still need reporting.
    cdef object _include_siblings
    # Index of the current element in _node_stack; -1 when the walk is over.
    cdef int _index
    # Bit mask built from the requested event names.
    cdef int _event_filter
    cdef _IterwalkSkipStates _skip_state

    def __init__(self, element_or_tree, events=(u"end",), tag=None):
        cdef _Element root
        cdef int ns_count
        root = _rootNodeOrRaise(element_or_tree)
        self._event_filter = _buildParseEventFilter(events)
        if tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
        self._node_stack = []
        self._events = []
        self._pop_event = self._events.pop
        self._skip_state = IWSKIP_CANNOT_SKIP  # ignore all skip requests by default

        if self._event_filter:
            self._index = 0
            if self._matcher is not None and self._event_filter & PARSE_EVENT_FILTER_START:
                self._matcher.cacheTags(root._doc)

            # When processing an ElementTree, add events for the preceding comments/PIs.
            if self._event_filter & (PARSE_EVENT_FILTER_COMMENT | PARSE_EVENT_FILTER_PI):
                if isinstance(element_or_tree, _ElementTree):
                    self._include_siblings = root
                    # reversed, so that the siblings are queued in document order
                    for elem in list(root.itersiblings(preceding=True))[::-1]:
                        if self._event_filter & PARSE_EVENT_FILTER_COMMENT and elem.tag is Comment:
                            self._events.append((u'comment', elem))
                        elif self._event_filter & PARSE_EVENT_FILTER_PI and elem.tag is PI:
                            self._events.append((u'pi', elem))

            ns_count = self._start_node(root)
            self._node_stack.append( (root, ns_count) )
        else:
            # no events requested => nothing to iterate over
            self._index = -1

    def __iter__(self):
        return self

    def __next__(self):
        cdef xmlNode* c_child
        cdef _Element node
        cdef _Element next_node
        cdef int ns_count = 0
        if self._events:
            return self._next_event()
        if self._matcher is not None and self._index >= 0:
            node = self._node_stack[self._index][0]
            self._matcher.cacheTags(node._doc)

        # find next node
        while self._index >= 0:
            node = self._node_stack[self._index][0]

            if self._skip_state == IWSKIP_SKIP_NEXT:
                # skip_subtree() was requested: pretend there are no children
                c_child = NULL
            else:
                c_child = self._process_non_elements(
                    node._doc, _findChildForwards(node._c_node, 0))
            self._skip_state = IWSKIP_CANNOT_SKIP

            while c_child is NULL:
                # back off through parents
                self._index -= 1
                node = self._end_node()
                if self._index < 0:
                    break
                c_child = self._process_non_elements(
                    node._doc, _nextElement(node._c_node))

            if c_child is not NULL:
                next_node = _elementFactory(node._doc, c_child)
                if self._event_filter & (PARSE_EVENT_FILTER_START |
                                         PARSE_EVENT_FILTER_START_NS):
                    ns_count = self._start_node(next_node)
                elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
                    ns_count = _countNsDefs(next_node._c_node)
                self._node_stack.append( (next_node, ns_count) )
                self._index += 1
            if self._events:
                return self._next_event()

        if self._include_siblings is not None:
            # Report comments/PIs that follow the root element (done only once).
            node, self._include_siblings = self._include_siblings, None
            self._process_non_elements(node._doc, _nextElement(node._c_node))
            if self._events:
                return self._next_event()

        raise StopIteration

    @cython.final
    cdef xmlNode* _process_non_elements(self, _Document doc, xmlNode* c_node):
        # Starting at c_node, step over comment and PI nodes, queueing an event
        # for each one whose type is enabled in the event filter.  Returns the
        # first node that is neither a comment nor a PI (or NULL).
        while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
            if c_node.type == tree.XML_COMMENT_NODE:
                if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
                    self._events.append(
                        (u"comment", _elementFactory(doc, c_node)))
                c_node = _nextElement(c_node)
            elif c_node.type == tree.XML_PI_NODE:
                if self._event_filter & PARSE_EVENT_FILTER_PI:
                    self._events.append(
                        (u"pi", _elementFactory(doc, c_node)))
                c_node = _nextElement(c_node)
            else:
                break
        return c_node

    @cython.final
    cdef _next_event(self):
        # Pop and return the oldest queued event.  If a start event was queued
        # and is the one being returned now, allow skip_subtree() to act on it.
        if self._skip_state == IWSKIP_NEXT_IS_START:
            if self._events[0][0] in (u'start', u'start-ns'):
                self._skip_state = IWSKIP_CAN_SKIP
        return self._pop_event(0)

    def skip_subtree(self):
        """Prevent descending into the current subtree.
        Instead, the next returned event will be the 'end' event of the current element
        (if included), ignoring any children or descendants.

        This has no effect right after an 'end' or 'end-ns' event.
        """
        if self._skip_state == IWSKIP_CAN_SKIP:
            self._skip_state = IWSKIP_SKIP_NEXT

    @cython.final
    cdef int _start_node(self, _Element node) except -1:
        # Queue the 'start-ns' and 'start' events requested for ``node``.
        # Returns the number of namespace declarations that need matching
        # 'end-ns' events later (0 if namespace events are not requested).
        cdef int ns_count
        if self._event_filter & PARSE_EVENT_FILTER_START_NS:
            ns_count = _appendStartNsEvents(node._c_node, self._events)
            if self._events:
                self._skip_state = IWSKIP_NEXT_IS_START
        elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
            ns_count = _countNsDefs(node._c_node)
        else:
            ns_count = 0
        if self._event_filter & PARSE_EVENT_FILTER_START:
            if self._matcher is None or self._matcher.matches(node._c_node):
                self._events.append( (u"start", node) )
                self._skip_state = IWSKIP_NEXT_IS_START
        return ns_count

    @cython.final
    cdef _Element _end_node(self):
        # Pop the innermost open element off the stack, queue its 'end' event
        # and one 'end-ns' event per counted namespace declaration (as
        # requested by the event filter), and return the element.
        cdef _Element node
        cdef int i, ns_count
        node, ns_count = self._node_stack.pop()
        if self._event_filter & PARSE_EVENT_FILTER_END:
            if self._matcher is None or self._matcher.matches(node._c_node):
                self._events.append( (u"end", node) )
        if self._event_filter & PARSE_EVENT_FILTER_END_NS and ns_count:
            event = (u"end-ns", None)
            for i in range(ns_count):
                self._events.append(event)
        return node


cdef int _countNsDefs(xmlNode* c_node):
    # Count the namespace declarations on c_node that have a non-NULL href.
    cdef xmlNs* c_ns
    cdef int count
    count = 0
    c_ns = c_node.nsDef
    while c_ns is not NULL:
        count += (c_ns.href is not NULL)
        c_ns = c_ns.next
    return count


cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
    # Append a ("start-ns", (prefix, href)) event to event_list for each
    # namespace declaration on c_node that has an href.  Returns the number
    # of events appended.
    cdef xmlNs* c_ns
    cdef int count
    count = 0
    c_ns = c_node.nsDef
    while c_ns is not NULL:
        if c_ns.href:
            ns_tuple = (funicodeOrEmpty(c_ns.prefix),
                        funicode(c_ns.href))
            event_list.append( (u"start-ns", ns_tuple) )
            count += 1
        c_ns = c_ns.next
    return count