/ lib / lxml / extensions.pxi
extensions.pxi
  1  # support for extension functions in XPath and XSLT
  2  
cdef class XPathError(LxmlError):
    """Base class of all XPath errors.

    Derives from LxmlError.
    """
  6  
cdef class XPathEvalError(XPathError):
    """Error during XPath evaluation.

    Subclass of XPathError.
    """
 10  
cdef class XPathFunctionError(XPathEvalError):
    """Internal error looking up an XPath extension function.

    Subclass of XPathEvalError.
    """
 14  
cdef class XPathResultError(XPathEvalError):
    """Error handling an XPath result.

    Subclass of XPathEvalError.
    """
 18  
 19  
# forward declarations

# C function pointer type of the callbacks that the register*Functions()
# and unregister*Functions() methods of _BaseContext call as
# func(ctxt, name_utf, ns_uri_utf).
ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf)
# the class body follows further down; _BaseContext.__init__() needs the name
cdef class _ExsltRegExp
 24  
 25  ################################################################################
 26  # Base class for XSLT and XPath evaluation contexts: functions, namespaces, ...
 27  
@cython.internal
cdef class _BaseContext:
    # libxml2 XPath context this object is attached to, NULL if none
    cdef xpath.xmlXPathContext* _xpathCtxt
    # document set by _register_context(), reset by _cleanup_context()
    cdef _Document _doc
    # local extension functions: {(ns_utf, name_utf): function}, or None
    cdef dict _extensions
    # local namespace mappings: [(prefix_utf, ns_uri_utf)], or None
    cdef list _namespaces
    # UTF-8 prefixes appended by registerNamespace() and
    # registerGlobalNamespaces()
    cdef list _global_namespaces
    # strings encoded by _to_utf(), mapped to their UTF-8 bytes
    cdef dict _utf_refs
    # registered functions: {ns_utf: {name_utf: function}}
    cdef dict _function_cache
    # dict handed out by the eval_context property, created on first access
    cdef dict _eval_context_dict
    # passed into __init__() and read by _unwrapXPathObject()
    cdef bint _build_smart_strings
    # for exception handling and temporary reference keeping:
    cdef _TempStore _temp_refs
    cdef set _temp_documents
    cdef _ExceptionContext _exc
    cdef _ErrorLog _error_log
 44  
    def __cinit__(self):
        # start without an XPath context; _set_xpath_context() attaches one
        self._xpathCtxt = NULL
 47  
 48      def __init__(self, namespaces, extensions, error_log, enable_regexp,
 49                   build_smart_strings):
 50          cdef _ExsltRegExp _regexp 
 51          cdef dict new_extensions
 52          cdef list ns
 53          self._utf_refs = {}
 54          self._global_namespaces = []
 55          self._function_cache = {}
 56          self._eval_context_dict = None
 57          self._error_log = error_log
 58  
 59          if extensions is not None:
 60              # convert extensions to UTF-8
 61              if isinstance(extensions, dict):
 62                  extensions = (extensions,)
 63              # format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function}
 64              new_extensions = {}
 65              for extension in extensions:
 66                  for (ns_uri, name), function in extension.items():
 67                      if name is None:
 68                          raise ValueError, u"extensions must have non empty names"
 69                      ns_utf   = self._to_utf(ns_uri)
 70                      name_utf = self._to_utf(name)
 71                      new_extensions[(ns_utf, name_utf)] = function
 72              extensions = new_extensions or None
 73  
 74          if namespaces is not None:
 75              if isinstance(namespaces, dict):
 76                  namespaces = namespaces.items()
 77              if namespaces:
 78                  ns = []
 79                  for prefix, ns_uri in namespaces:
 80                      if prefix is None or not prefix:
 81                          raise TypeError, \
 82                              u"empty namespace prefix is not supported in XPath"
 83                      if ns_uri is None or not ns_uri:
 84                          raise TypeError, \
 85                              u"setting default namespace is not supported in XPath"
 86                      prefix_utf = self._to_utf(prefix)
 87                      ns_uri_utf = self._to_utf(ns_uri)
 88                      ns.append( (prefix_utf, ns_uri_utf) )
 89                  namespaces = ns
 90              else:
 91                  namespaces = None
 92  
 93          self._doc        = None
 94          self._exc        = _ExceptionContext()
 95          self._extensions = extensions
 96          self._namespaces = namespaces
 97          self._temp_refs  = _TempStore()
 98          self._temp_documents  = set()
 99          self._build_smart_strings = build_smart_strings
100  
101          if enable_regexp:
102              _regexp = _ExsltRegExp()
103              _regexp._register_in_context(self)
104  
105      cdef _BaseContext _copy(self):
106          cdef _BaseContext context
107          if self._namespaces is not None:
108              namespaces = self._namespaces[:]
109          else:
110              namespaces = None
111          context = self.__class__(namespaces, None, self._error_log, False,
112                                   self._build_smart_strings)
113          if self._extensions is not None:
114              context._extensions = self._extensions.copy()
115          return context
116  
117      cdef bytes _to_utf(self, s):
118          u"Convert to UTF-8 and keep a reference to the encoded string"
119          cdef python.PyObject* dict_result
120          if s is None:
121              return None
122          dict_result = python.PyDict_GetItem(self._utf_refs, s)
123          if dict_result is not NULL:
124              return <bytes>dict_result
125          utf = _utf8(s)
126          self._utf_refs[s] = utf
127          if python.IS_PYPY:
128              # use C level refs, PyPy refs are not enough!
129              python.Py_INCREF(utf)
130          return utf
131  
    cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt):
        # Attach this object to the given libxml2 XPath context: remember
        # the context, store a pointer to self as its user data and install
        # _receiveXPathError as its error callback.
        self._xpathCtxt = xpathCtxt
        xpathCtxt.userData = <void*>self
        xpathCtxt.error = _receiveXPathError
136  
    @cython.final
    cdef _register_context(self, _Document doc):
        # Remember the document to work on and reset the exception context.
        self._doc = doc
        self._exc.clear()
141  
    @cython.final
    cdef _cleanup_context(self):
        # Drop the cached UTF-8 strings, the evaluation context dict and the
        # document reference.
        #xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt)
        #self.unregisterGlobalNamespaces()
        if python.IS_PYPY:
            # clean up double refs in PyPy (see "_to_utf()" method)
            for ref in self._utf_refs.itervalues():
                python.Py_DECREF(ref)
        self._utf_refs.clear()
        self._eval_context_dict = None
        self._doc = None
153  
154      @cython.final
155      cdef _release_context(self):
156          if self._xpathCtxt is not NULL:
157              self._xpathCtxt.userData = NULL
158              self._xpathCtxt = NULL
159  
160      # namespaces (internal UTF-8 methods with leading '_')
161  
162      cdef addNamespace(self, prefix, ns_uri):
163          cdef list namespaces
164          if prefix is None:
165              raise TypeError, u"empty prefix is not supported in XPath"
166          prefix_utf = self._to_utf(prefix)
167          ns_uri_utf = self._to_utf(ns_uri)
168          new_item = (prefix_utf, ns_uri_utf)
169          if self._namespaces is None:
170              self._namespaces = [new_item]
171          else:
172              namespaces = []
173              for item in self._namespaces:
174                  if item[0] == prefix_utf:
175                      item = new_item
176                      new_item = None
177                  namespaces.append(item)
178              if new_item is not None:
179                  namespaces.append(new_item)
180              self._namespaces = namespaces
181          if self._xpathCtxt is not NULL:
182              xpath.xmlXPathRegisterNs(
183                  self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
184  
    cdef registerNamespace(self, prefix, ns_uri):
        # Register a prefix-to-namespace mapping with the current XPath
        # context and remember the prefix in self._global_namespaces.
        # Raises TypeError if the prefix is None.
        # NOTE(review): unlike addNamespace(), this does not test
        # self._xpathCtxt for NULL - presumably it is only called while a
        # context is set; confirm against the callers.
        if prefix is None:
            raise TypeError, u"empty prefix is not supported in XPath"
        prefix_utf = self._to_utf(prefix)
        ns_uri_utf = self._to_utf(ns_uri)
        self._global_namespaces.append(prefix_utf)
        xpath.xmlXPathRegisterNs(self._xpathCtxt,
                                 _xcstr(prefix_utf), _xcstr(ns_uri_utf))
193  
194      cdef registerLocalNamespaces(self):
195          if self._namespaces is None:
196              return
197          for prefix_utf, ns_uri_utf in self._namespaces:
198              xpath.xmlXPathRegisterNs(
199                  self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
200  
201      cdef registerGlobalNamespaces(self):
202          cdef list ns_prefixes = _find_all_extension_prefixes()
203          if python.PyList_GET_SIZE(ns_prefixes) > 0:
204              for prefix_utf, ns_uri_utf in ns_prefixes:
205                  self._global_namespaces.append(prefix_utf)
206                  xpath.xmlXPathRegisterNs(
207                      self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
208  
209      cdef unregisterGlobalNamespaces(self):
210          if python.PyList_GET_SIZE(self._global_namespaces) > 0:
211              for prefix_utf in self._global_namespaces:
212                  xpath.xmlXPathRegisterNs(self._xpathCtxt,
213                                           _xcstr(prefix_utf), NULL)
214              del self._global_namespaces[:]
215      
    cdef void _unregisterNamespace(self, prefix_utf):
        # Register the (UTF-8 encoded) prefix with a NULL namespace URI,
        # as unregisterGlobalNamespaces() does for each global prefix.
        xpath.xmlXPathRegisterNs(self._xpathCtxt,
                                 _xcstr(prefix_utf), NULL)
219      
220      # extension functions
221  
222      cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1:
223          if self._extensions is None:
224              self._extensions = {}
225          self._extensions[(ns_utf, name_utf)] = function
226          return 0
227  
228      cdef registerGlobalFunctions(self, void* ctxt,
229                                   _register_function reg_func):
230          cdef python.PyObject* dict_result
231          cdef dict d
232          for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems():
233              dict_result = python.PyDict_GetItem(
234                  self._function_cache, ns_utf)
235              if dict_result is not NULL:
236                  d = <dict>dict_result
237              else:
238                  d = {}
239                  self._function_cache[ns_utf] = d
240              for name_utf, function in ns_functions.iteritems():
241                  d[name_utf] = function
242                  reg_func(ctxt, name_utf, ns_utf)
243  
244      cdef registerLocalFunctions(self, void* ctxt,
245                                  _register_function reg_func):
246          cdef python.PyObject* dict_result
247          cdef dict d
248          if self._extensions is None:
249              return # done
250          last_ns = None
251          d = None
252          for (ns_utf, name_utf), function in self._extensions.iteritems():
253              if ns_utf is not last_ns or d is None:
254                  last_ns = ns_utf
255                  dict_result = python.PyDict_GetItem(
256                      self._function_cache, ns_utf)
257                  if dict_result is not NULL:
258                      d = <dict>dict_result
259                  else:
260                      d = {}
261                      self._function_cache[ns_utf] = d
262              d[name_utf] = function
263              reg_func(ctxt, name_utf, ns_utf)
264  
    cdef unregisterAllFunctions(self, void* ctxt,
                                      _register_function unreg_func):
        # Call unreg_func(ctxt, name, ns) for every function in the
        # function cache.  The cache itself is left unchanged.
        for ns_utf, functions in self._function_cache.iteritems():
            for name_utf in functions:
                unreg_func(ctxt, name_utf, ns_utf)
270  
271      cdef unregisterGlobalFunctions(self, void* ctxt,
272                                           _register_function unreg_func):
273          for ns_utf, functions in self._function_cache.items():
274              for name_utf in functions:
275                  if self._extensions is None or \
276                         (ns_utf, name_utf) not in self._extensions:
277                      unreg_func(ctxt, name_utf, ns_utf)
278  
279      @cython.final
280      cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name):
281          u"""Lookup an extension function in the cache and return it.
282  
283          Parameters: c_ns_uri may be NULL, c_name must not be NULL
284          """
285          cdef python.PyObject* c_dict
286          cdef python.PyObject* dict_result
287          c_dict = python.PyDict_GetItem(
288              self._function_cache, None if c_ns_uri is NULL else c_ns_uri)
289          if c_dict is not NULL:
290              dict_result = python.PyDict_GetItem(
291                  <object>c_dict, <unsigned char*>c_name)
292              if dict_result is not NULL:
293                  return <object>dict_result
294          return None
295  
296      # Python access to the XPath context for extension functions
297  
298      @property
299      def context_node(self):
300          cdef xmlNode* c_node
301          if self._xpathCtxt is NULL:
302              raise XPathError, \
303                  u"XPath context is only usable during the evaluation"
304          c_node = self._xpathCtxt.node
305          if c_node is NULL:
306              raise XPathError, u"no context node"
307          if c_node.doc != self._xpathCtxt.doc:
308              raise XPathError, \
309                  u"document-external context nodes are not supported"
310          if self._doc is None:
311              raise XPathError, u"document context is missing"
312          return _elementFactory(self._doc, c_node)
313  
    @property
    def eval_context(self):
        # A dict that is created on first access and handed out again on
        # later accesses until _cleanup_context() resets it to None.
        if self._eval_context_dict is None:
            self._eval_context_dict = {}
        return self._eval_context_dict
319  
320      # Python reference keeping during XPath function evaluation
321  
    @cython.final
    cdef _release_temp_refs(self):
        u"""Free temporarily referenced objects from this context.

        Clears both the store of held objects and the set of their
        documents that _hold() filled.
        """
        self._temp_refs.clear()
        self._temp_documents.clear()
327  
328      @cython.final
329      cdef _hold(self, obj):
330          u"""A way to temporarily hold references to nodes in the evaluator.
331  
332          This is needed because otherwise nodes created in XPath extension
333          functions would be reference counted too soon, during the XPath
334          evaluation.  This is most important in the case of exceptions.
335          """
336          cdef _Element element
337          if isinstance(obj, _Element):
338              self._temp_refs.add(obj)
339              self._temp_documents.add((<_Element>obj)._doc)
340              return
341          elif _isString(obj) or not python.PySequence_Check(obj):
342              return
343          for o in obj:
344              if isinstance(o, _Element):
345                  #print "Holding element:", <int>element._c_node
346                  self._temp_refs.add(o)
347                  #print "Holding document:", <int>element._doc._c_doc
348                  self._temp_documents.add((<_Element>o)._doc)
349  
350      @cython.final
351      cdef _Document _findDocumentForNode(self, xmlNode* c_node):
352          u"""If an XPath expression returns an element from a different
353          document than the current context document, we call this to
354          see if it was possibly created by an extension and is a known
355          document instance.
356          """
357          cdef _Document doc
358          for doc in self._temp_documents:
359              if doc is not None and doc._c_doc is c_node.doc:
360                  return doc
361          return None
362  
363  
# libxml2 keeps these error messages in a static array in its code
# and doesn't give us access to them ...
#
# _forwardXPathError() indexes this tuple with
# (error code - XML_XPATH_EXPRESSION_OK), so the order of the entries
# matters.
# NOTE(review): only the last two messages end in a newline - looks
# unintended, but they are runtime strings and are left unchanged here.

cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = (
    b"Ok",
    b"Number encoding",
    b"Unfinished literal",
    b"Start of literal",
    b"Expected $ for variable reference",
    b"Undefined variable",
    b"Invalid predicate",
    b"Invalid expression",
    b"Missing closing curly brace",
    b"Unregistered function",
    b"Invalid operand",
    b"Invalid type",
    b"Invalid number of arguments",
    b"Invalid context size",
    b"Invalid context position",
    b"Memory allocation error",
    b"Syntax error",
    b"Resource error",
    b"Sub resource error",
    b"Undefined namespace prefix",
    b"Encoding error",
    b"Char out of XML range",
    b"Invalid or incomplete context",
    b"Stack usage error",
    b"Forbidden variable\n",
    b"?? Unknown error ??\n",
)
395  
cdef void _forwardXPathError(void* c_ctxt, xmlerror.xmlError* c_error) with gil:
    # Build a copy of the libxml2 error and pass it to the error log of the
    # _BaseContext that c_ctxt points to.  If libxml2 provided no message,
    # one is looked up in LIBXML2_XPATH_ERROR_MESSAGES by error code.
    cdef xmlerror.xmlError error
    cdef int message_index
    if c_error.message is NULL:
        message_index = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK
        if message_index < 0 or \
                message_index >= len(LIBXML2_XPATH_ERROR_MESSAGES):
            error.message = b"unknown error"
        else:
            error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[message_index])
    else:
        error.message = c_error.message
    error.domain = c_error.domain
    error.code = c_error.code
    error.level = c_error.level
    error.line = c_error.line
    error.int2 = c_error.int1 # column
    error.file = c_error.file
    error.node = NULL

    (<_BaseContext>c_ctxt)._error_log._receive(&error)
416  
cdef void _receiveXPathError(void* c_context, xmlerror.xmlError* error) nogil:
    # Error callback installed by _BaseContext._set_xpath_context().
    # Errors are only forwarded when __DEBUG is set: to the context's own
    # error log if a context is given, otherwise to _forwardError().
    if __DEBUG:
        if c_context is not NULL:
            _forwardXPathError(c_context, error)
        else:
            _forwardError(NULL, error)
424  
425  
def Extension(module, function_mapping=None, *, ns=None):
    u"""Extension(module, function_mapping=None, ns=None)

    Collect functions of a module (or methods of an object) into a
    dictionary of extension functions, keyed by ``(ns, xpath_name)``.

    ``function_mapping`` selects what is taken from ``module``:

    * a dict maps attribute names to the XPath function names to use
    * any other iterable lists the attribute names to take as they are
    * None (the default) takes every attribute whose name does not
      start with an underscore

    The ``ns`` keyword argument is the namespace URI of the XPath
    functions.
    """
    if isinstance(function_mapping, dict):
        return {
            (ns, xpath_name): getattr(module, function_name)
            for function_name, xpath_name in function_mapping.items()
        }
    if function_mapping is None:
        function_mapping = [
            name for name in dir(module) if not name.startswith(u'_')
        ]
    return {
        (ns, function_name): getattr(module, function_name)
        for function_name in function_mapping
    }
450  
451  ################################################################################
452  # EXSLT regexp implementation
453  
@cython.final
@cython.internal
cdef class _ExsltRegExp:
    # Implementation of the EXSLT regular expression functions test(),
    # match() and replace() on top of the Python "re" module.

    # compiled patterns, keyed by (pattern string, ignore_case)
    cdef dict _compile_map
    def __cinit__(self):
        self._compile_map = {}

    cdef _make_string(self, value):
        # Convert an XPath function argument to a string.  For a list
        # (node set), only the first item counts; an element contributes
        # its recursive text content, an empty list the empty string.
        if _isString(value):
            return value
        if not isinstance(value, list):
            return unicode(value)
        if python.PyList_GET_SIZE(value) == 0:
            return u''
        first_item = value[0]
        if _isString(first_item):
            return first_item
        if not isinstance(first_item, _Element):
            return unicode(first_item)
        c_text = tree.xmlNodeGetContent((<_Element>first_item)._c_node)
        if c_text is NULL:
            raise MemoryError()
        try:
            return funicode(c_text)
        finally:
            tree.xmlFree(c_text)

    cdef _compile(self, rexp, ignore_case):
        # Return the compiled pattern for the given expression, compiling
        # and caching it on first use.
        cdef python.PyObject* c_cached
        rexp = self._make_string(rexp)
        cache_key = (rexp, ignore_case)
        c_cached = python.PyDict_GetItem(self._compile_map, cache_key)
        if c_cached is not NULL:
            return <object>c_cached
        if ignore_case:
            re_flags = re.UNICODE | re.IGNORECASE
        else:
            re_flags = re.UNICODE
        compiled = re.compile(rexp, re_flags)
        self._compile_map[cache_key] = compiled
        return compiled

    def test(self, ctxt, s, rexp, flags=u''):
        # True if the expression matches anywhere in the string
        flags = self._make_string(flags)
        s = self._make_string(s)
        rexpc = self._compile(rexp, u'i' in flags)
        return rexpc.search(s) is not None

    def match(self, ctxt, s, rexp, flags=u''):
        # Returns a list of <match> elements, or an empty tuple if nothing
        # matched.  With the 'g' flag, there is one element per match,
        # otherwise one for the first match followed by one per group.
        cdef list match_elements
        flags = self._make_string(flags)
        s = self._make_string(s)
        rexpc = self._compile(rexp, u'i' in flags)
        if u'g' in flags:
            found = rexpc.findall(s)
            if not found:
                return ()
        else:
            first_match = rexpc.search(s)
            if not first_match:
                return ()
            found = [ first_match.group() ]
            found.extend( first_match.groups(u'') )
        match_elements = []
        root = Element(u'matches')
        for text in found:
            if python.PyTuple_CheckExact(text):
                # findall() returns tuples for patterns with several groups
                text = u''.join(text)
            child = SubElement(root, u'match')
            child.text = text
            match_elements.append(child)
        return match_elements

    def replace(self, ctxt, s, rexp, flags, replacement):
        # Replaces all matches with the 'g' flag, otherwise only the first
        replacement = self._make_string(replacement)
        flags = self._make_string(flags)
        s = self._make_string(s)
        rexpc = self._compile(rexp, u'i' in flags)
        count = 0 if u'g' in flags else 1
        return rexpc.sub(replacement, s, count)

    cdef _register_in_context(self, _BaseContext context):
        # Add the three functions as local extension functions of the
        # given context, in the EXSLT regular expressions namespace.
        ns = b"http://exslt.org/regular-expressions"
        for name, function in ((b"test",    self.test),
                               (b"match",   self.match),
                               (b"replace", self.replace)):
            context._addLocalExtensionFunction(ns, name, function)
549  
550  
551  ################################################################################
552  # helper functions
553  
cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc,
                                            _BaseContext context) except NULL:
    # Convert a Python value into a new libxml2 XPath object:
    #
    # * unicode/bytes strings -> XPath string
    # * bool                  -> XPath boolean (tested before numbers)
    # * other numbers         -> XPath float
    # * None                  -> empty node set
    # * a single _Element     -> node set with that node
    # * other sequences       -> node set of their items, which may be
    #                            elements or strings
    #
    # Raises XPathResultError for unsupported values or sequence items and
    # MemoryError if libxml2 fails to create a node for a string item.
    cdef xpath.xmlNodeSet* resultSet
    cdef _Element fake_node = None
    cdef xmlNode* c_node

    if isinstance(obj, unicode):
        obj = _utf8(obj)
    if isinstance(obj, bytes):
        # libxml2 copies the string value
        return xpath.xmlXPathNewCString(_cstr(obj))
    if isinstance(obj, bool):
        return xpath.xmlXPathNewBoolean(obj)
    if python.PyNumber_Check(obj):
        return xpath.xmlXPathNewFloat(obj)
    if obj is None:
        resultSet = xpath.xmlXPathNodeSetCreate(NULL)
    elif isinstance(obj, _Element):
        resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node)
    elif python.PySequence_Check(obj):
        resultSet = xpath.xmlXPathNodeSetCreate(NULL)
        try:
            for value in obj:
                if isinstance(value, _Element):
                    if context is not None:
                        # keep the element referenced by the context
                        context._hold(value)
                    xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node)
                else:
                    if context is None or doc is None:
                        raise XPathResultError, \
                              f"Non-Element values not supported at this point - got {value!r}"
                    # support strings by appending text nodes to an Element
                    if isinstance(value, unicode):
                        value = _utf8(value)
                    if isinstance(value, bytes):
                        if fake_node is None:
                            # first string item: create the element that
                            # becomes the parent of all text nodes and let
                            # the context hold it
                            fake_node = _makeElement("text-root", NULL, doc, None,
                                                     None, None, None, None, None)
                            context._hold(fake_node)
                        else:
                            # append a comment node to keep the text nodes separate
                            c_node = tree.xmlNewDocComment(doc._c_doc, <unsigned char*>"")
                            if c_node is NULL:
                                raise MemoryError()
                            tree.xmlAddChild(fake_node._c_node, c_node)
                        context._hold(value)
                        c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value))
                        if c_node is NULL:
                            raise MemoryError()
                        tree.xmlAddChild(fake_node._c_node, c_node)
                        xpath.xmlXPathNodeSetAdd(resultSet, c_node)
                    else:
                        raise XPathResultError, \
                              f"This is not a supported node-set result: {value!r}"
        except:
            # do not leak the half-built node set, then re-raise
            xpath.xmlXPathFreeNodeSet(resultSet)
            raise
    else:
        raise XPathResultError, f"Unknown return type: {python._fqtypename(obj).decode('utf8')}"
    return xpath.xmlXPathWrapNodeSet(resultSet)
614  
cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj,
                               _Document doc, _BaseContext context):
    u"""Convert a libxml2 XPath result object into a Python value.

    Node sets and XSLT result tree fragments become lists, booleans and
    numbers are returned as such, strings as unicode strings ("smart"
    string results if the context asks for them).

    Raises XPathResultError for undefined or unknown result types and
    NotImplementedError for point, range, location set and user types.
    """
    if xpathObj.type == xpath.XPATH_UNDEFINED:
        raise XPathResultError(u"Undefined xpath result")
    if (xpathObj.type == xpath.XPATH_NODESET or
            xpathObj.type == xpath.XPATH_XSLT_TREE):
        return _createNodeSetResult(xpathObj, doc, context)
    if xpathObj.type == xpath.XPATH_BOOLEAN:
        return xpathObj.boolval
    if xpathObj.type == xpath.XPATH_NUMBER:
        return xpathObj.floatval
    if xpathObj.type == xpath.XPATH_STRING:
        text = funicode(xpathObj.stringval)
        if not context._build_smart_strings:
            return text
        return _elementStringResultFactory(text, None, None, 0)
    if xpathObj.type == xpath.XPATH_POINT:
        raise NotImplementedError(u"XPATH_POINT")
    if xpathObj.type == xpath.XPATH_RANGE:
        raise NotImplementedError(u"XPATH_RANGE")
    if xpathObj.type == xpath.XPATH_LOCATIONSET:
        raise NotImplementedError(u"XPATH_LOCATIONSET")
    if xpathObj.type == xpath.XPATH_USERS:
        raise NotImplementedError(u"XPATH_USERS")
    raise XPathResultError(f"Unknown xpath result {xpathObj.type}")
643  
cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc,
                                 _BaseContext context):
    u"""Build the Python result list for a node set result.

    Returns an empty list if the XPath object has no node set.
    """
    cdef int index
    cdef bint is_fragment
    cdef list nodes = []
    if xpathObj.nodesetval is not NULL:
        is_fragment = xpathObj.type == xpath.XPATH_XSLT_TREE
        for index in range(xpathObj.nodesetval.nodeNr):
            _unpackNodeSetEntry(nodes, xpathObj.nodesetval.nodeTab[index],
                                doc, context, is_fragment)
    return nodes
657  
cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc,
                         _BaseContext context, bint is_fragment):
    u"""Append the Python representation of one node set entry to
    ``results``.

    Elements are appended as elements, text, CDATA and attribute nodes as
    string results, namespace declarations as (prefix, href) tuples.
    Document nodes are only unpacked (child by child) for result tree
    fragments, XInclude markers are skipped.

    Raises MemoryError if a foreign element cannot be copied into ``doc``
    and NotImplementedError for any other node type.
    """
    cdef xmlNode* c_child
    if _isElement(c_node):
        if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
            # XXX: works, but maybe not always the right thing to do?
            # XPath: only runs when extensions create or copy trees
            #        -> we store Python refs to these, so that is OK
            # XSLT: can it leak when merging trees from multiple sources?
            c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
            if c_node is NULL:
                # do not pass a failed copy on to the element factory
                raise MemoryError()
            # FIXME: call _instantiateElementFromXPath() instead?
        results.append(
            _fakeDocElementFactory(doc, c_node))
    elif c_node.type == tree.XML_TEXT_NODE or \
             c_node.type == tree.XML_CDATA_SECTION_NODE or \
             c_node.type == tree.XML_ATTRIBUTE_NODE:
        results.append(
            _buildElementStringResult(doc, c_node, context))
    elif c_node.type == tree.XML_NAMESPACE_DECL:
        results.append( (funicodeOrNone((<xmlNs*>c_node).prefix),
                         funicodeOrNone((<xmlNs*>c_node).href)) )
    elif c_node.type == tree.XML_DOCUMENT_NODE or \
            c_node.type == tree.XML_HTML_DOCUMENT_NODE:
        # ignored for everything but result tree fragments
        if is_fragment:
            c_child = c_node.children
            while c_child is not NULL:
                _unpackNodeSetEntry(results, c_child, doc, context, 0)
                c_child = c_child.next
    elif c_node.type == tree.XML_XINCLUDE_START or \
            c_node.type == tree.XML_XINCLUDE_END:
        pass
    else:
        raise NotImplementedError, \
            f"Not yet implemented result node type: {c_node.type}"
693  
cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj):
    u"""Free the XPath object, but *never* free the *content* of node sets.
    Python dealloc will do that for us.
    """
    if xpathObj.nodesetval is not NULL:
        # free the node set separately and reset the pointer before the
        # object itself is freed
        xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval)
        xpathObj.nodesetval = NULL
    xpath.xmlXPathFreeObject(xpathObj)
702  
cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc,
                                           _BaseContext context):
    u"""Return the element object for a node found by an XPath evaluation.

    Nodes of a document that is held by the context are instantiated for
    that document.  Nodes of unknown documents are copied into ``doc``
    first; MemoryError is raised if that copy fails.
    """
    # NOTE: this may copy the element - only call this when it can't leak
    if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
        # not from the context document and not from a fake document
        # either => may still be from a known document, e.g. one
        # created by an extension function
        node_doc = context._findDocumentForNode(c_node)
        if node_doc is None:
            # not from a known document at all! => can only make a
            # safety copy here
            c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
            if c_node is NULL:
                # do not pass a failed copy on to the element factory
                raise MemoryError()
        else:
            doc = node_doc
    return _fakeDocElementFactory(doc, c_node)
718  
719  ################################################################################
720  # special str/unicode subclasses
721  
# unicode string result that carries information about where the string
# came from; the fields are filled in by _elementStringResultFactory()
@cython.final
cdef class _ElementUnicodeResult(unicode):
    # element returned by getparent(), may be None
    cdef _Element _parent
    # attribute name for attribute values, None otherwise
    cdef readonly object attrname
    cdef readonly bint is_tail
    cdef readonly bint is_text
    cdef readonly bint is_attribute

    def getparent(self):
        return self._parent
732  
# plain Python replacement for _ElementUnicodeResult, only defined on PyPy
cdef object _PyElementUnicodeResult
if python.IS_PYPY:
    class _PyElementUnicodeResult(unicode):
        # we need to use a Python class here, or PyPy will crash on creation
        # https://bitbucket.org/pypy/pypy/issues/2021/pypy3-pytype_ready-crashes-for-extension
        def getparent(self):
            # "_parent" is set by _elementStringResultFactory()
            return self._parent
740  
class _ElementStringResult(bytes):
    """Byte string result that remembers the element it was found on."""
    # we need to use a Python class here, bytes cannot be C-subclassed
    # in Pyrex/Cython
    def getparent(self):
        """Return the element that this string result was found on."""
        # '_parent' is assigned by _elementStringResultFactory()
        return self._parent
746  
cdef object _elementStringResultFactory(string_value, _Element parent,
                                        attrname, bint is_tail):
    """Wrap 'string_value' in a "smart string" that knows its origin.

    The result class depends on the input: exact 'bytes' values become
    an _ElementStringResult, everything else becomes an
    _ElementUnicodeResult (or its plain Python counterpart on PyPy).
    The result carries the parent element, the attribute name and the
    is_attribute / is_tail / is_text flags.
    """
    cdef _ElementUnicodeResult uresult
    cdef bint is_attribute = attrname is not None
    # without a parent, the value is never reported as element text
    cdef bint is_text = parent is not None and not (is_tail or is_attribute)
    cdef bint is_bytes = type(string_value) is bytes

    if not is_bytes and not python.IS_PYPY:
        # C-level subclass: the fields are only reachable through the
        # typed variable
        uresult = _ElementUnicodeResult(string_value)
        uresult._parent = parent
        uresult.is_attribute = is_attribute
        uresult.is_tail = is_tail
        uresult.is_text = is_text
        uresult.attrname = attrname
        return uresult

    # plain Python classes: set the values as normal instance attributes
    if is_bytes:
        result = _ElementStringResult(string_value)
    else:
        result = _PyElementUnicodeResult(string_value)
    result._parent = parent
    result.is_attribute = is_attribute
    result.is_tail = is_tail
    result.is_text = is_text
    result.attrname = attrname
    return result
781  
cdef object _buildElementStringResult(_Document doc, xmlNode* c_node,
                                      _BaseContext context):
    """Build the Python string result for an attribute or text node.

    Returns the plain decoded value if the context does not build smart
    strings.  Otherwise, returns a smart string from
    ``_elementStringResultFactory()`` that refers to the element the
    value was found on (if there is one).

    Raises MemoryError if libxml2 fails to provide the attribute value.
    """
    cdef _Element parent = None
    cdef object attrname = None
    cdef xmlNode* c_element
    cdef bint is_tail

    if c_node.type == tree.XML_ATTRIBUTE_NODE:
        attrname = _namespacedName(c_node)
        is_tail = 0
        s = tree.xmlNodeGetContent(c_node)
        if s is NULL:
            # no value buffer was returned => do not try to decode NULL
            raise MemoryError()
        try:
            value = funicode(s)
        finally:
            # the content buffer is a copy that we own
            tree.xmlFree(s)
        c_element = NULL
    else:
        # expected here: a text node or a CDATA section,
        # may be tail text or normal text
        value = funicode(c_node.content)
        c_element = _previousElement(c_node)
        is_tail = c_element is not NULL

    if not context._build_smart_strings:
        return value

    if c_element is NULL:
        # non-tail text or attribute text
        # => walk up to the closest enclosing element
        c_element = c_node.parent
        while c_element is not NULL and not _isElement(c_element):
            c_element = c_element.parent

    if c_element is not NULL:
        parent = _instantiateElementFromXPath(c_element, doc, context)

    return _elementStringResultFactory(
        value, parent, attrname, is_tail)
819  
820  ################################################################################
821  # callbacks for XPath/XSLT extension functions
822  
cdef void _extension_function_call(_BaseContext context, function,
                                   xpath.xmlXPathParserContext* ctxt, int nargs):
    """Call a Python extension function with arguments from the XPath stack.

    Pops 'nargs' values from the stack of 'ctxt', converts them to Python
    objects, calls 'function(context, *args)' and pushes the wrapped
    result back onto the stack.

    Never raises.  On any exception, an XPath error is signalled to
    libxml2 and the exception is stored in the exception context of
    'context'.
    """
    cdef _Document doc
    cdef xpath.xmlXPathObject* obj
    cdef list args
    cdef int i
    doc = context._doc
    try:
        args = []
        for i in range(nargs):
            obj = xpath.valuePop(ctxt)
            try:
                o = _unwrapXPathObject(obj, doc, context)
            finally:
                # the object was taken off the stack, so it must be freed
                # here even if the conversion failed
                _freeXPathObject(obj)
            args.append(o)
        # the values come off the stack last-argument-first
        args.reverse()

        res = function(context, *args)
        # wrap result for XPath consumption
        obj = _wrapXPathObject(res, doc, context)
        # prevent Python from deallocating elements handed to libxml2
        context._hold(res)
        xpath.valuePush(ctxt, obj)
    except:
        xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR)
        context._exc._store_raised()
    finally:
        return  # swallow any further exceptions
850  
851  # lookup the function by name and call it
852  
cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt,
                               int nargs) with gil:
    """Callback that dispatches an XPath function call to Python.

    Looks up the function named by the evaluation context in the function
    cache of the _BaseContext stored as its user data, then calls it.  An
    unknown function or any exception is signalled to libxml2 as an XPath
    error and recorded in the exception context.  Never raises.
    """
    cdef xpath.xmlXPathContext* rctxt = ctxt.context
    cdef _BaseContext context = <_BaseContext> rctxt.userData
    try:
        function = context._find_cached_function(rctxt.functionURI, rctxt.function)
        if function is None:
            # nothing registered under that name
            xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
            context._exc._store_exception(XPathFunctionError(
                f"XPath function '{_namespacedNameFromNsName(rctxt.functionURI, rctxt.function)}' not found"))
        else:
            _extension_function_call(context, function, ctxt, nargs)
    except:
        # may not be the right error, but we need to tell libxml2 *something*
        xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
        context._exc._store_raised()
    finally:
        return  # swallow any further exceptions